diff options
Diffstat (limited to 'src/gallium/drivers/vc4')
43 files changed, 2324 insertions, 1528 deletions
diff --git a/src/gallium/drivers/vc4/Makefile.am b/src/gallium/drivers/vc4/Makefile.am index 3f62ce21a9f..f4a57ba3404 100644 --- a/src/gallium/drivers/vc4/Makefile.am +++ b/src/gallium/drivers/vc4/Makefile.am @@ -19,8 +19,6 @@ # FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS # IN THE SOFTWARE. -AUTOMAKE_OPTIONS = subdir-objects - include Makefile.sources include $(top_srcdir)/src/gallium/Automake.inc @@ -30,10 +28,10 @@ SIM_LDFLAGS = -lsimpenrose endif AM_CFLAGS = \ + -I$(top_builddir)/src/glsl/nir \ $(LIBDRM_CFLAGS) \ $(GALLIUM_DRIVER_CFLAGS) \ $(SIM_CFLAGS) \ - -I$(top_srcdir)/src/mesa/ \ $() noinst_LTLIBRARIES = libvc4.la diff --git a/src/gallium/drivers/vc4/Makefile.sources b/src/gallium/drivers/vc4/Makefile.sources index 1eb029e67e7..6fb40c20562 100644 --- a/src/gallium/drivers/vc4/Makefile.sources +++ b/src/gallium/drivers/vc4/Makefile.sources @@ -19,6 +19,8 @@ C_SOURCES := \ vc4_fence.c \ vc4_formats.c \ vc4_job.c \ + vc4_nir_lower_blend.c \ + vc4_nir_lower_io.c \ vc4_opt_algebraic.c \ vc4_opt_constant_folding.c \ vc4_opt_copy_propagation.c \ @@ -49,4 +51,5 @@ C_SOURCES := \ vc4_state.c \ vc4_tiling.c \ vc4_tiling.h \ + vc4_uniforms.c \ $() diff --git a/src/gallium/drivers/vc4/kernel/vc4_drv.h b/src/gallium/drivers/vc4/kernel/vc4_drv.h index 1fd8aa9fb28..ffc973735ae 100644 --- a/src/gallium/drivers/vc4/kernel/vc4_drv.h +++ b/src/gallium/drivers/vc4/kernel/vc4_drv.h @@ -26,17 +26,6 @@ #include "vc4_simulator_validate.h" -enum vc4_bo_mode { - VC4_MODE_UNDECIDED, - VC4_MODE_RENDER, - VC4_MODE_SHADER, -}; - -struct vc4_bo_exec_state { - struct drm_gem_cma_object *bo; - enum vc4_bo_mode mode; -}; - struct vc4_exec_info { /* Sequence number for this bin/render job. */ uint64_t seqno; @@ -47,7 +36,7 @@ struct vc4_exec_info { /* This is the array of BOs that were looked up at the start of exec. * Command validation will use indices into this array. 
*/ - struct vc4_bo_exec_state *bo; + struct drm_gem_cma_object **bo; uint32_t bo_count; /* List of other BOs used in the job that need to be released @@ -72,7 +61,6 @@ struct vc4_exec_info { * command lists. */ struct vc4_shader_state { - uint8_t packet; uint32_t addr; /* Maximum vertex index referenced by any primitive using this * shader state. @@ -88,6 +76,7 @@ struct vc4_exec_info { bool found_tile_binning_mode_config_packet; bool found_start_tile_binning_packet; bool found_increment_semaphore_packet; + bool found_flush; uint8_t bin_tiles_x, bin_tiles_y; struct drm_gem_cma_object *tile_bo; uint32_t tile_alloc_offset; @@ -99,6 +88,9 @@ struct vc4_exec_info { uint32_t ct0ca, ct0ea; uint32_t ct1ca, ct1ea; + /* Pointer to the unvalidated bin CL (if present). */ + void *bin_u; + /* Pointers to the shader recs. These paddr gets incremented as CL * packets are relocated in validate_gl_shader_state, and the vaddrs * (u and v) get incremented and size decremented as the shader recs @@ -168,10 +160,8 @@ vc4_validate_shader_recs(struct drm_device *dev, struct vc4_exec_info *exec); struct vc4_validated_shader_info * vc4_validate_shader(struct drm_gem_cma_object *shader_obj); -bool vc4_use_bo(struct vc4_exec_info *exec, - uint32_t hindex, - enum vc4_bo_mode mode, - struct drm_gem_cma_object **obj); +struct drm_gem_cma_object *vc4_use_bo(struct vc4_exec_info *exec, + uint32_t hindex); int vc4_get_rcl(struct drm_device *dev, struct vc4_exec_info *exec); diff --git a/src/gallium/drivers/vc4/kernel/vc4_gem.c b/src/gallium/drivers/vc4/kernel/vc4_gem.c index e4b7fea5968..93f9ec7ed9b 100644 --- a/src/gallium/drivers/vc4/kernel/vc4_gem.c +++ b/src/gallium/drivers/vc4/kernel/vc4_gem.c @@ -112,6 +112,8 @@ vc4_get_bcl(struct drm_device *dev, struct vc4_exec_info *exec) exec->ct0ca = exec->exec_bo->paddr + bin_offset; + exec->bin_u = bin; + exec->shader_rec_v = exec->exec_bo->vaddr + shader_rec_offset; exec->shader_rec_p = exec->exec_bo->paddr + shader_rec_offset; exec->shader_rec_size 
= args->shader_rec_size; diff --git a/src/gallium/drivers/vc4/kernel/vc4_packet.h b/src/gallium/drivers/vc4/kernel/vc4_packet.h index 88cfc0fa9f0..771e2b78761 100644 --- a/src/gallium/drivers/vc4/kernel/vc4_packet.h +++ b/src/gallium/drivers/vc4/kernel/vc4_packet.h @@ -88,16 +88,22 @@ enum vc4_packet { #define VC4_PACKET_START_TILE_BINNING_SIZE 1 #define VC4_PACKET_INCREMENT_SEMAPHORE_SIZE 1 #define VC4_PACKET_WAIT_ON_SEMAPHORE_SIZE 1 +#define VC4_PACKET_BRANCH_SIZE 5 #define VC4_PACKET_BRANCH_TO_SUB_LIST_SIZE 5 #define VC4_PACKET_STORE_MS_TILE_BUFFER_SIZE 1 #define VC4_PACKET_STORE_MS_TILE_BUFFER_AND_EOF_SIZE 1 +#define VC4_PACKET_STORE_FULL_RES_TILE_BUFFER_SIZE 5 +#define VC4_PACKET_LOAD_FULL_RES_TILE_BUFFER_SIZE 5 #define VC4_PACKET_STORE_TILE_BUFFER_GENERAL_SIZE 7 #define VC4_PACKET_LOAD_TILE_BUFFER_GENERAL_SIZE 7 #define VC4_PACKET_GL_INDEXED_PRIMITIVE_SIZE 14 #define VC4_PACKET_GL_ARRAY_PRIMITIVE_SIZE 10 +#define VC4_PACKET_COMPRESSED_PRIMITIVE_SIZE 1 +#define VC4_PACKET_CLIPPED_COMPRESSED_PRIMITIVE_SIZE 1 #define VC4_PACKET_PRIMITIVE_LIST_FORMAT_SIZE 2 #define VC4_PACKET_GL_SHADER_STATE_SIZE 5 #define VC4_PACKET_NV_SHADER_STATE_SIZE 5 +#define VC4_PACKET_VG_SHADER_STATE_SIZE 5 #define VC4_PACKET_CONFIGURATION_BITS_SIZE 4 #define VC4_PACKET_FLAT_SHADE_FLAGS_SIZE 5 #define VC4_PACKET_POINT_SIZE_SIZE 5 @@ -106,6 +112,7 @@ enum vc4_packet { #define VC4_PACKET_DEPTH_OFFSET_SIZE 5 #define VC4_PACKET_CLIP_WINDOW_SIZE 9 #define VC4_PACKET_VIEWPORT_OFFSET_SIZE 5 +#define VC4_PACKET_Z_CLIPPING_SIZE 9 #define VC4_PACKET_CLIPPER_XY_SCALING_SIZE 9 #define VC4_PACKET_CLIPPER_Z_SCALING_SIZE 9 #define VC4_PACKET_TILE_BINNING_MODE_CONFIG_SIZE 16 @@ -136,6 +143,16 @@ enum vc4_packet { /** @{ * + * low bits of VC4_PACKET_STORE_FULL_RES_TILE_BUFFER and + * VC4_PACKET_LOAD_FULL_RES_TILE_BUFFER. 
+ */ +#define VC4_LOADSTORE_FULL_RES_EOF (1 << 3) +#define VC4_LOADSTORE_FULL_RES_DISABLE_CLEAR_ALL (1 << 2) +#define VC4_LOADSTORE_FULL_RES_DISABLE_ZS (1 << 1) +#define VC4_LOADSTORE_FULL_RES_DISABLE_COLOR (1 << 0) + +/** @{ + * * byte 2 of VC4_PACKET_STORE_TILE_BUFFER_GENERAL and * VC4_PACKET_LOAD_TILE_BUFFER_GENERAL (low bits of the address) */ diff --git a/src/gallium/drivers/vc4/kernel/vc4_render_cl.c b/src/gallium/drivers/vc4/kernel/vc4_render_cl.c index e2d907ad91f..b827eb7e9e1 100644 --- a/src/gallium/drivers/vc4/kernel/vc4_render_cl.c +++ b/src/gallium/drivers/vc4/kernel/vc4_render_cl.c @@ -100,7 +100,8 @@ static void emit_tile(struct vc4_exec_info *exec, struct vc4_rcl_setup *setup, uint8_t x, uint8_t y, bool first, bool last) { - bool has_bin = exec->args->bin_cl_size != 0; + struct drm_vc4_submit_cl *args = exec->args; + bool has_bin = args->bin_cl_size != 0; /* Note that the load doesn't actually occur until the * tile coords packet is processed, and only one load @@ -108,10 +109,9 @@ static void emit_tile(struct vc4_exec_info *exec, */ if (setup->color_read) { rcl_u8(setup, VC4_PACKET_LOAD_TILE_BUFFER_GENERAL); - rcl_u16(setup, exec->args->color_read.bits); + rcl_u16(setup, args->color_read.bits); rcl_u32(setup, - setup->color_read->paddr + - exec->args->color_read.offset); + setup->color_read->paddr + args->color_read.offset); } if (setup->zs_read) { @@ -122,9 +122,8 @@ static void emit_tile(struct vc4_exec_info *exec, } rcl_u8(setup, VC4_PACKET_LOAD_TILE_BUFFER_GENERAL); - rcl_u16(setup, exec->args->zs_read.bits); - rcl_u32(setup, - setup->zs_read->paddr + exec->args->zs_read.offset); + rcl_u16(setup, args->zs_read.bits); + rcl_u32(setup, setup->zs_read->paddr + args->zs_read.offset); } /* Clipping depends on tile coordinates having been @@ -147,11 +146,11 @@ static void emit_tile(struct vc4_exec_info *exec, if (setup->zs_write) { rcl_u8(setup, VC4_PACKET_STORE_TILE_BUFFER_GENERAL); - rcl_u16(setup, exec->args->zs_write.bits | + rcl_u16(setup, 
args->zs_write.bits | (setup->color_ms_write ? VC4_STORE_TILE_BUFFER_DISABLE_COLOR_CLEAR : 0)); rcl_u32(setup, - (setup->zs_write->paddr + exec->args->zs_write.offset) | + (setup->zs_write->paddr + args->zs_write.offset) | ((last && !setup->color_ms_write) ? VC4_LOADSTORE_TILE_BUFFER_EOF : 0)); } @@ -172,11 +171,12 @@ static void emit_tile(struct vc4_exec_info *exec, static int vc4_create_rcl_bo(struct drm_device *dev, struct vc4_exec_info *exec, struct vc4_rcl_setup *setup) { - bool has_bin = exec->args->bin_cl_size != 0; - uint8_t min_x_tile = exec->args->min_x_tile; - uint8_t min_y_tile = exec->args->min_y_tile; - uint8_t max_x_tile = exec->args->max_x_tile; - uint8_t max_y_tile = exec->args->max_y_tile; + struct drm_vc4_submit_cl *args = exec->args; + bool has_bin = args->bin_cl_size != 0; + uint8_t min_x_tile = args->min_x_tile; + uint8_t min_y_tile = args->min_y_tile; + uint8_t max_x_tile = args->max_x_tile; + uint8_t max_y_tile = args->max_y_tile; uint8_t xtiles = max_x_tile - min_x_tile + 1; uint8_t ytiles = max_y_tile - min_y_tile + 1; uint8_t x, y; @@ -185,7 +185,7 @@ static int vc4_create_rcl_bo(struct drm_device *dev, struct vc4_exec_info *exec, size = VC4_PACKET_TILE_RENDERING_MODE_CONFIG_SIZE; loop_body_size = VC4_PACKET_TILE_COORDINATES_SIZE; - if (exec->args->flags & VC4_SUBMIT_CL_USE_CLEAR_COLOR) { + if (args->flags & VC4_SUBMIT_CL_USE_CLEAR_COLOR) { size += VC4_PACKET_CLEAR_COLORS_SIZE + VC4_PACKET_TILE_COORDINATES_SIZE + VC4_PACKET_STORE_TILE_BUFFER_GENERAL_SIZE; @@ -208,7 +208,7 @@ static int vc4_create_rcl_bo(struct drm_device *dev, struct vc4_exec_info *exec, } if (setup->zs_write) - loop_body_size += VC4_PACKET_LOAD_TILE_BUFFER_GENERAL_SIZE; + loop_body_size += VC4_PACKET_STORE_TILE_BUFFER_GENERAL_SIZE; if (setup->color_ms_write) { if (setup->zs_write) loop_body_size += VC4_PACKET_TILE_COORDINATES_SIZE; @@ -226,23 +226,23 @@ static int vc4_create_rcl_bo(struct drm_device *dev, struct vc4_exec_info *exec, rcl_u32(setup, (setup->color_ms_write 
? (setup->color_ms_write->paddr + - exec->args->color_ms_write.offset) : + args->color_ms_write.offset) : 0)); - rcl_u16(setup, exec->args->width); - rcl_u16(setup, exec->args->height); - rcl_u16(setup, exec->args->color_ms_write.bits); + rcl_u16(setup, args->width); + rcl_u16(setup, args->height); + rcl_u16(setup, args->color_ms_write.bits); /* The tile buffer gets cleared when the previous tile is stored. If * the clear values changed between frames, then the tile buffer has * stale clear values in it, so we have to do a store in None mode (no * writes) so that we trigger the tile buffer clear. */ - if (exec->args->flags & VC4_SUBMIT_CL_USE_CLEAR_COLOR) { + if (args->flags & VC4_SUBMIT_CL_USE_CLEAR_COLOR) { rcl_u8(setup, VC4_PACKET_CLEAR_COLORS); - rcl_u32(setup, exec->args->clear_color[0]); - rcl_u32(setup, exec->args->clear_color[1]); - rcl_u32(setup, exec->args->clear_z); - rcl_u8(setup, exec->args->clear_s); + rcl_u32(setup, args->clear_color[0]); + rcl_u32(setup, args->clear_color[1]); + rcl_u32(setup, args->clear_z); + rcl_u8(setup, args->clear_s); vc4_tile_coordinates(setup, 0, 0); @@ -286,7 +286,8 @@ static int vc4_rcl_surface_setup(struct vc4_exec_info *exec, if (surf->hindex == ~0) return 0; - if (!vc4_use_bo(exec, surf->hindex, VC4_MODE_RENDER, obj)) + *obj = vc4_use_bo(exec, surf->hindex); + if (!*obj) return -EINVAL; if (surf->bits & ~(VC4_LOADSTORE_TILE_BUFFER_TILING_MASK | @@ -365,7 +366,8 @@ vc4_rcl_ms_surface_setup(struct vc4_exec_info *exec, if (surf->hindex == ~0) return 0; - if (!vc4_use_bo(exec, surf->hindex, VC4_MODE_RENDER, obj)) + *obj = vc4_use_bo(exec, surf->hindex); + if (!*obj) return -EINVAL; if (tiling > VC4_TILING_FORMAT_LT) { diff --git a/src/gallium/drivers/vc4/kernel/vc4_validate.c b/src/gallium/drivers/vc4/kernel/vc4_validate.c index a0b67a7e50b..b248831113c 100644 --- a/src/gallium/drivers/vc4/kernel/vc4_validate.c +++ b/src/gallium/drivers/vc4/kernel/vc4_validate.c @@ -94,42 +94,42 @@ size_is_lt(uint32_t width, uint32_t 
height, int cpp) height <= 4 * utile_height(cpp)); } -bool -vc4_use_bo(struct vc4_exec_info *exec, - uint32_t hindex, - enum vc4_bo_mode mode, - struct drm_gem_cma_object **obj) +struct drm_gem_cma_object * +vc4_use_bo(struct vc4_exec_info *exec, uint32_t hindex) { - *obj = NULL; + struct drm_gem_cma_object *obj; + struct drm_vc4_bo *bo; if (hindex >= exec->bo_count) { DRM_ERROR("BO index %d greater than BO count %d\n", hindex, exec->bo_count); - return false; + return NULL; } + obj = exec->bo[hindex]; + bo = to_vc4_bo(&obj->base); - if (exec->bo[hindex].mode != mode) { - if (exec->bo[hindex].mode == VC4_MODE_UNDECIDED) { - exec->bo[hindex].mode = mode; - } else { - DRM_ERROR("BO index %d reused with mode %d vs %d\n", - hindex, exec->bo[hindex].mode, mode); - return false; - } + if (bo->validated_shader) { + DRM_ERROR("Trying to use shader BO as something other than " + "a shader\n"); + return NULL; } - *obj = exec->bo[hindex].bo; - return true; + return obj; +} + +static struct drm_gem_cma_object * +vc4_use_handle(struct vc4_exec_info *exec, uint32_t gem_handles_packet_index) +{ + return vc4_use_bo(exec, exec->bo_index[gem_handles_packet_index]); } static bool -vc4_use_handle(struct vc4_exec_info *exec, - uint32_t gem_handles_packet_index, - enum vc4_bo_mode mode, - struct drm_gem_cma_object **obj) +validate_bin_pos(struct vc4_exec_info *exec, void *untrusted, uint32_t pos) { - return vc4_use_bo(exec, exec->bo_index[gem_handles_packet_index], - mode, obj); + /* Note that the untrusted pointer passed to these functions is + * incremented past the packet byte. 
+ */ + return (untrusted - 1 == exec->bin_u + pos); } static uint32_t @@ -201,14 +201,15 @@ vc4_check_tex_size(struct vc4_exec_info *exec, struct drm_gem_cma_object *fbo, return true; } + static int -validate_flush_all(VALIDATE_ARGS) +validate_flush(VALIDATE_ARGS) { - if (exec->found_increment_semaphore_packet) { - DRM_ERROR("VC4_PACKET_FLUSH_ALL after " - "VC4_PACKET_INCREMENT_SEMAPHORE\n"); + if (!validate_bin_pos(exec, untrusted, exec->args->bin_cl_size - 1)) { + DRM_ERROR("Bin CL must end with VC4_PACKET_FLUSH\n"); return -EINVAL; } + exec->found_flush = true; return 0; } @@ -233,17 +234,13 @@ validate_start_tile_binning(VALIDATE_ARGS) static int validate_increment_semaphore(VALIDATE_ARGS) { - if (exec->found_increment_semaphore_packet) { - DRM_ERROR("Duplicate VC4_PACKET_INCREMENT_SEMAPHORE\n"); + if (!validate_bin_pos(exec, untrusted, exec->args->bin_cl_size - 2)) { + DRM_ERROR("Bin CL must end with " + "VC4_PACKET_INCREMENT_SEMAPHORE\n"); return -EINVAL; } exec->found_increment_semaphore_packet = true; - /* Once we've found the semaphore increment, there should be one FLUSH - * then the end of the command list. The FLUSH actually triggers the - * increment, so we only need to make sure there - */ - return 0; } @@ -257,11 +254,6 @@ validate_indexed_prim_list(VALIDATE_ARGS) uint32_t index_size = (*(uint8_t *)(untrusted + 0) >> 4) ? 
2 : 1; struct vc4_shader_state *shader_state; - if (exec->found_increment_semaphore_packet) { - DRM_ERROR("Drawing after VC4_PACKET_INCREMENT_SEMAPHORE\n"); - return -EINVAL; - } - /* Check overflow condition */ if (exec->shader_state_count == 0) { DRM_ERROR("shader state must precede primitives\n"); @@ -272,7 +264,8 @@ validate_indexed_prim_list(VALIDATE_ARGS) if (max_index > shader_state->max_index) shader_state->max_index = max_index; - if (!vc4_use_handle(exec, 0, VC4_MODE_RENDER, &ib)) + ib = vc4_use_handle(exec, 0); + if (!ib) return -EINVAL; if (offset > ib->base.size || @@ -295,11 +288,6 @@ validate_gl_array_primitive(VALIDATE_ARGS) uint32_t max_index; struct vc4_shader_state *shader_state; - if (exec->found_increment_semaphore_packet) { - DRM_ERROR("Drawing after VC4_PACKET_INCREMENT_SEMAPHORE\n"); - return -EINVAL; - } - /* Check overflow condition */ if (exec->shader_state_count == 0) { DRM_ERROR("shader state must precede primitives\n"); @@ -329,7 +317,6 @@ validate_gl_shader_state(VALIDATE_ARGS) return -EINVAL; } - exec->shader_state[i].packet = VC4_PACKET_GL_SHADER_STATE; exec->shader_state[i].addr = *(uint32_t *)untrusted; exec->shader_state[i].max_index = 0; @@ -348,31 +335,6 @@ validate_gl_shader_state(VALIDATE_ARGS) } static int -validate_nv_shader_state(VALIDATE_ARGS) -{ - uint32_t i = exec->shader_state_count++; - - if (i >= exec->shader_state_size) { - DRM_ERROR("More requests for shader states than declared\n"); - return -EINVAL; - } - - exec->shader_state[i].packet = VC4_PACKET_NV_SHADER_STATE; - exec->shader_state[i].addr = *(uint32_t *)untrusted; - - if (exec->shader_state[i].addr & 15) { - DRM_ERROR("NV shader state address 0x%08x misaligned\n", - exec->shader_state[i].addr); - return -EINVAL; - } - - *(uint32_t *)validated = (exec->shader_state[i].addr + - exec->shader_rec_p); - - return 0; -} - -static int validate_tile_binning_config(VALIDATE_ARGS) { struct drm_device *dev = exec->exec_bo->base.dev; @@ -473,8 +435,8 @@ static const 
struct cmd_info { } cmd_info[] = { VC4_DEFINE_PACKET(VC4_PACKET_HALT, "halt", NULL), VC4_DEFINE_PACKET(VC4_PACKET_NOP, "nop", NULL), - VC4_DEFINE_PACKET(VC4_PACKET_FLUSH, "flush", NULL), - VC4_DEFINE_PACKET(VC4_PACKET_FLUSH_ALL, "flush all state", validate_flush_all), + VC4_DEFINE_PACKET(VC4_PACKET_FLUSH, "flush", validate_flush), + VC4_DEFINE_PACKET(VC4_PACKET_FLUSH_ALL, "flush all state", NULL), VC4_DEFINE_PACKET(VC4_PACKET_START_TILE_BINNING, "start tile binning", validate_start_tile_binning), VC4_DEFINE_PACKET(VC4_PACKET_INCREMENT_SEMAPHORE, "increment semaphore", validate_increment_semaphore), @@ -488,7 +450,7 @@ static const struct cmd_info { VC4_DEFINE_PACKET(VC4_PACKET_PRIMITIVE_LIST_FORMAT, "primitive list format", NULL), VC4_DEFINE_PACKET(VC4_PACKET_GL_SHADER_STATE, "GL Shader State", validate_gl_shader_state), - VC4_DEFINE_PACKET(VC4_PACKET_NV_SHADER_STATE, "NV Shader State", validate_nv_shader_state), + /* We don't support validating NV shader states. */ VC4_DEFINE_PACKET(VC4_PACKET_CONFIGURATION_BITS, "configuration bits", NULL), VC4_DEFINE_PACKET(VC4_PACKET_FLAT_SHADE_FLAGS, "flat shade flags", NULL), @@ -525,7 +487,7 @@ vc4_validate_bin_cl(struct drm_device *dev, u8 cmd = *(uint8_t *)src_pkt; const struct cmd_info *info; - if (cmd > ARRAY_SIZE(cmd_info)) { + if (cmd >= ARRAY_SIZE(cmd_info)) { DRM_ERROR("0x%08x: packet %d out of bounds\n", src_offset, cmd); return -EINVAL; @@ -580,8 +542,16 @@ vc4_validate_bin_cl(struct drm_device *dev, return -EINVAL; } - if (!exec->found_increment_semaphore_packet) { - DRM_ERROR("Bin CL missing VC4_PACKET_INCREMENT_SEMAPHORE\n"); + /* The bin CL must be ended with INCREMENT_SEMAPHORE and FLUSH. The + * semaphore is used to trigger the render CL to start up, and the + * FLUSH is what caps the bin lists with + * VC4_PACKET_RETURN_FROM_SUB_LIST (so they jump back to the main + * render CL when they get called to) and actually triggers the queued + * semaphore increment. 
+ */ + if (!exec->found_increment_semaphore_packet || !exec->found_flush) { + DRM_ERROR("Bin CL missing VC4_PACKET_INCREMENT_SEMAPHORE + " + "VC4_PACKET_FLUSH\n"); return -EINVAL; } @@ -612,18 +582,19 @@ reloc_tex(struct vc4_exec_info *exec, uint32_t cube_map_stride = 0; enum vc4_texture_data_type type; - if (!vc4_use_bo(exec, texture_handle_index, VC4_MODE_RENDER, &tex)) + tex = vc4_use_bo(exec, texture_handle_index); + if (!tex) return false; if (sample->is_direct) { uint32_t remaining_size = tex->base.size - p0; if (p0 > tex->base.size - 4) { DRM_ERROR("UBO offset greater than UBO size\n"); - return false; + goto fail; } if (p1 > remaining_size - 4) { DRM_ERROR("UBO clamp would allow reads outside of UBO\n"); - return false; + goto fail; } *validated_p0 = tex->paddr + p0; return true; @@ -642,14 +613,14 @@ reloc_tex(struct vc4_exec_info *exec, VC4_TEX_P2_PTYPE_CUBE_MAP_STRIDE) { if (cube_map_stride) { DRM_ERROR("Cube map stride set twice\n"); - return false; + goto fail; } cube_map_stride = p3 & VC4_TEX_P2_CMST_MASK; } if (!cube_map_stride) { DRM_ERROR("Cube map stride not set\n"); - return false; + goto fail; } } @@ -683,7 +654,7 @@ reloc_tex(struct vc4_exec_info *exec, case VC4_TEXTURE_TYPE_YUV422R: default: DRM_ERROR("Texture format %d unsupported\n", type); - return false; + goto fail; } utile_w = utile_width(cpp); utile_h = utile_height(cpp); @@ -699,7 +670,7 @@ reloc_tex(struct vc4_exec_info *exec, if (!vc4_check_tex_size(exec, tex, offset + cube_map_stride * 5, tiling_format, width, height, cpp)) { - return false; + goto fail; } /* The mipmap levels are stored before the base of the texture. 
Make @@ -740,7 +711,7 @@ reloc_tex(struct vc4_exec_info *exec, i, level_width, level_height, aligned_width, aligned_height, level_size, offset); - return false; + goto fail; } offset -= level_size; @@ -749,54 +720,37 @@ reloc_tex(struct vc4_exec_info *exec, *validated_p0 = tex->paddr + p0; return true; + fail: + DRM_INFO("Texture p0 at %d: 0x%08x\n", sample->p_offset[0], p0); + DRM_INFO("Texture p1 at %d: 0x%08x\n", sample->p_offset[1], p1); + DRM_INFO("Texture p2 at %d: 0x%08x\n", sample->p_offset[2], p2); + DRM_INFO("Texture p3 at %d: 0x%08x\n", sample->p_offset[3], p3); + return false; } static int -validate_shader_rec(struct drm_device *dev, - struct vc4_exec_info *exec, - struct vc4_shader_state *state) +validate_gl_shader_rec(struct drm_device *dev, + struct vc4_exec_info *exec, + struct vc4_shader_state *state) { uint32_t *src_handles; void *pkt_u, *pkt_v; - enum shader_rec_reloc_type { - RELOC_CODE, - RELOC_VBO, - }; - struct shader_rec_reloc { - enum shader_rec_reloc_type type; - uint32_t offset; - }; - static const struct shader_rec_reloc gl_relocs[] = { - { RELOC_CODE, 4 }, /* fs */ - { RELOC_CODE, 16 }, /* vs */ - { RELOC_CODE, 28 }, /* cs */ + static const uint32_t shader_reloc_offsets[] = { + 4, /* fs */ + 16, /* vs */ + 28, /* cs */ }; - static const struct shader_rec_reloc nv_relocs[] = { - { RELOC_CODE, 4 }, /* fs */ - { RELOC_VBO, 12 } - }; - const struct shader_rec_reloc *relocs; - struct drm_gem_cma_object *bo[ARRAY_SIZE(gl_relocs) + 8]; - uint32_t nr_attributes = 0, nr_fixed_relocs, nr_relocs, packet_size; + uint32_t shader_reloc_count = ARRAY_SIZE(shader_reloc_offsets); + struct drm_gem_cma_object *bo[shader_reloc_count + 8]; + uint32_t nr_attributes, nr_relocs, packet_size; int i; - struct vc4_validated_shader_info *validated_shader = NULL; - - if (state->packet == VC4_PACKET_NV_SHADER_STATE) { - relocs = nv_relocs; - nr_fixed_relocs = ARRAY_SIZE(nv_relocs); - packet_size = 16; - } else { - relocs = gl_relocs; - nr_fixed_relocs = 
ARRAY_SIZE(gl_relocs); - - nr_attributes = state->addr & 0x7; - if (nr_attributes == 0) - nr_attributes = 8; - packet_size = gl_shader_rec_size(state->addr); - } - nr_relocs = nr_fixed_relocs + nr_attributes; + nr_attributes = state->addr & 0x7; + if (nr_attributes == 0) + nr_attributes = 8; + packet_size = gl_shader_rec_size(state->addr); + nr_relocs = ARRAY_SIZE(shader_reloc_offsets) + nr_attributes; if (nr_relocs * 4 > exec->shader_rec_size) { DRM_ERROR("overflowed shader recs reading %d handles " "from %d bytes left\n", @@ -826,21 +780,30 @@ validate_shader_rec(struct drm_device *dev, exec->shader_rec_v += roundup(packet_size, 16); exec->shader_rec_size -= packet_size; - for (i = 0; i < nr_relocs; i++) { - enum vc4_bo_mode mode; - - if (i < nr_fixed_relocs && relocs[i].type == RELOC_CODE) - mode = VC4_MODE_SHADER; - else - mode = VC4_MODE_RENDER; + if (!(*(uint16_t *)pkt_u & VC4_SHADER_FLAG_FS_SINGLE_THREAD)) { + DRM_ERROR("Multi-threaded fragment shaders not supported.\n"); + return -EINVAL; + } - if (!vc4_use_bo(exec, src_handles[i], mode, &bo[i])) { - return false; + for (i = 0; i < shader_reloc_count; i++) { + if (src_handles[i] > exec->bo_count) { + DRM_ERROR("Shader handle %d too big\n", src_handles[i]); + return -EINVAL; } + + bo[i] = exec->bo[src_handles[i]]; + if (!bo[i]) + return -EINVAL; + } + for (i = shader_reloc_count; i < nr_relocs; i++) { + bo[i] = vc4_use_bo(exec, src_handles[i]); + if (!bo[i]) + return -EINVAL; } - for (i = 0; i < nr_fixed_relocs; i++) { - uint32_t o = relocs[i].offset; + for (i = 0; i < shader_reloc_count; i++) { + struct vc4_validated_shader_info *validated_shader; + uint32_t o = shader_reloc_offsets[i]; uint32_t src_offset = *(uint32_t *)(pkt_u + o); uint32_t *texture_handles_u; void *uniform_data_u; @@ -848,58 +811,50 @@ validate_shader_rec(struct drm_device *dev, *(uint32_t *)(pkt_v + o) = bo[i]->paddr + src_offset; - switch (relocs[i].type) { - case RELOC_CODE: - if (src_offset != 0) { - DRM_ERROR("Shaders must be at 
offset 0 of " - "the BO.\n"); - goto fail; - } + if (src_offset != 0) { + DRM_ERROR("Shaders must be at offset 0 of " + "the BO.\n"); + return -EINVAL; + } - kfree(validated_shader); - validated_shader = vc4_validate_shader(bo[i]); - if (!validated_shader) - goto fail; + validated_shader = to_vc4_bo(&bo[i]->base)->validated_shader; + if (!validated_shader) + return -EINVAL; - if (validated_shader->uniforms_src_size > - exec->uniforms_size) { - DRM_ERROR("Uniforms src buffer overflow\n"); - goto fail; - } + if (validated_shader->uniforms_src_size > + exec->uniforms_size) { + DRM_ERROR("Uniforms src buffer overflow\n"); + return -EINVAL; + } - texture_handles_u = exec->uniforms_u; - uniform_data_u = (texture_handles_u + - validated_shader->num_texture_samples); - - memcpy(exec->uniforms_v, uniform_data_u, - validated_shader->uniforms_size); - - for (tex = 0; - tex < validated_shader->num_texture_samples; - tex++) { - if (!reloc_tex(exec, - uniform_data_u, - &validated_shader->texture_samples[tex], - texture_handles_u[tex])) { - goto fail; - } - } + texture_handles_u = exec->uniforms_u; + uniform_data_u = (texture_handles_u + + validated_shader->num_texture_samples); - *(uint32_t *)(pkt_v + o + 4) = exec->uniforms_p; + memcpy(exec->uniforms_v, uniform_data_u, + validated_shader->uniforms_size); - exec->uniforms_u += validated_shader->uniforms_src_size; - exec->uniforms_v += validated_shader->uniforms_size; - exec->uniforms_p += validated_shader->uniforms_size; + for (tex = 0; + tex < validated_shader->num_texture_samples; + tex++) { + if (!reloc_tex(exec, + uniform_data_u, + &validated_shader->texture_samples[tex], + texture_handles_u[tex])) { + return -EINVAL; + } + } - break; + *(uint32_t *)(pkt_v + o + 4) = exec->uniforms_p; - case RELOC_VBO: - break; - } + exec->uniforms_u += validated_shader->uniforms_src_size; + exec->uniforms_v += validated_shader->uniforms_size; + exec->uniforms_p += validated_shader->uniforms_size; } for (i = 0; i < nr_attributes; i++) { - 
struct drm_gem_cma_object *vbo = bo[nr_fixed_relocs + i]; + struct drm_gem_cma_object *vbo = + bo[ARRAY_SIZE(shader_reloc_offsets) + i]; uint32_t o = 36 + i * 8; uint32_t offset = *(uint32_t *)(pkt_u + o + 0); uint32_t attr_size = *(uint8_t *)(pkt_u + o + 4) + 1; @@ -929,13 +884,7 @@ validate_shader_rec(struct drm_device *dev, *(uint32_t *)(pkt_v + o) = vbo->paddr + offset; } - kfree(validated_shader); - return 0; - -fail: - kfree(validated_shader); - return -EINVAL; } int @@ -946,7 +895,7 @@ vc4_validate_shader_recs(struct drm_device *dev, int ret = 0; for (i = 0; i < exec->shader_state_count; i++) { - ret = validate_shader_rec(dev, exec, &exec->shader_state[i]); + ret = validate_gl_shader_rec(dev, exec, &exec->shader_state[i]); if (ret) return ret; } diff --git a/src/gallium/drivers/vc4/vc4_blit.c b/src/gallium/drivers/vc4/vc4_blit.c index d29e2c9c318..e52a1941730 100644 --- a/src/gallium/drivers/vc4/vc4_blit.c +++ b/src/gallium/drivers/vc4/vc4_blit.c @@ -94,7 +94,7 @@ vc4_render_blit(struct pipe_context *ctx, struct pipe_blit_info *info) struct vc4_context *vc4 = vc4_context(ctx); if (!util_blitter_is_blit_supported(vc4->blitter, info)) { - fprintf(stderr, "blit unsupported %s -> %s", + fprintf(stderr, "blit unsupported %s -> %s\n", util_format_short_name(info->src.resource->format), util_format_short_name(info->dst.resource->format)); return false; @@ -135,7 +135,7 @@ vc4_blit(struct pipe_context *pctx, const struct pipe_blit_info *blit_info) info.dst.resource->nr_samples <= 1 && !util_format_is_depth_or_stencil(info.src.resource->format) && !util_format_is_pure_integer(info.src.resource->format)) { - fprintf(stderr, "color resolve unimplemented"); + fprintf(stderr, "color resolve unimplemented\n"); return; } @@ -147,7 +147,7 @@ vc4_blit(struct pipe_context *pctx, const struct pipe_blit_info *blit_info) } if (info.mask & PIPE_MASK_S) { - fprintf(stderr, "cannot blit stencil, skipping"); + fprintf(stderr, "cannot blit stencil, skipping\n"); info.mask &= 
~PIPE_MASK_S; } diff --git a/src/gallium/drivers/vc4/vc4_bufmgr.c b/src/gallium/drivers/vc4/vc4_bufmgr.c index cbdb9e89cf6..f7b41f5816d 100644 --- a/src/gallium/drivers/vc4/vc4_bufmgr.c +++ b/src/gallium/drivers/vc4/vc4_bufmgr.c @@ -1,5 +1,5 @@ /* - * Copyright © 2014 Broadcom + * Copyright © 2014-2015 Broadcom * * Permission is hereby granted, free of charge, to any person obtaining a * copy of this software and associated documentation files (the "Software"), @@ -94,7 +94,7 @@ vc4_bo_from_cache(struct vc4_screen *screen, uint32_t size, const char *name) * allocate something new instead, since we assume that the * user will proceed to CPU map it and fill it with stuff. */ - if (!vc4_bo_wait(bo, 0)) { + if (!vc4_bo_wait(bo, 0, NULL)) { pipe_mutex_unlock(cache->lock); return NULL; } @@ -381,15 +381,57 @@ vc4_bo_get_dmabuf(struct vc4_bo *bo) } struct vc4_bo * -vc4_bo_alloc_mem(struct vc4_screen *screen, const void *data, uint32_t size, - const char *name) +vc4_bo_alloc_shader(struct vc4_screen *screen, const void *data, uint32_t size) { - void *map; struct vc4_bo *bo; + int ret; + + bo = CALLOC_STRUCT(vc4_bo); + if (!bo) + return NULL; + + pipe_reference_init(&bo->reference, 1); + bo->screen = screen; + bo->size = align(size, 4096); + bo->name = "code"; + bo->private = false; /* Make sure it doesn't go back to the cache. 
*/ + + if (!using_vc4_simulator) { + struct drm_vc4_create_shader_bo create = { + .size = size, + .data = (uintptr_t)data, + }; + + ret = drmIoctl(screen->fd, DRM_IOCTL_VC4_CREATE_SHADER_BO, + &create); + bo->handle = create.handle; + } else { + struct drm_mode_create_dumb create; + memset(&create, 0, sizeof(create)); + + create.width = 128; + create.bpp = 8; + create.height = (size + 127) / 128; + + ret = drmIoctl(screen->fd, DRM_IOCTL_MODE_CREATE_DUMB, &create); + bo->handle = create.handle; + assert(create.size >= size); + + vc4_bo_map(bo); + memcpy(bo->map, data, size); + } + if (ret != 0) { + fprintf(stderr, "create shader ioctl failure\n"); + abort(); + } + + screen->bo_count++; + screen->bo_size += bo->size; + if (dump_stats) { + fprintf(stderr, "Allocated shader %dkb:\n", size / 1024); + vc4_bo_dump_stats(screen); + } - bo = vc4_bo_alloc(screen, size, name); - map = vc4_bo_map(bo); - memcpy(map, data, size); return bo; } @@ -413,63 +455,91 @@ vc4_bo_flink(struct vc4_bo *bo, uint32_t *name) return true; } +static int vc4_wait_seqno_ioctl(int fd, uint64_t seqno, uint64_t timeout_ns) +{ + if (using_vc4_simulator) + return 0; + + struct drm_vc4_wait_seqno wait = { + .seqno = seqno, + .timeout_ns = timeout_ns, + }; + int ret = drmIoctl(fd, DRM_IOCTL_VC4_WAIT_SEQNO, &wait); + if (ret == -1) + return -errno; + else + return 0; + +} + bool -vc4_wait_seqno(struct vc4_screen *screen, uint64_t seqno, uint64_t timeout_ns) +vc4_wait_seqno(struct vc4_screen *screen, uint64_t seqno, uint64_t timeout_ns, + const char *reason) { if (screen->finished_seqno >= seqno) return true; - struct drm_vc4_wait_seqno wait; - memset(&wait, 0, sizeof(wait)); - wait.seqno = seqno; - wait.timeout_ns = timeout_ns; - - int ret; - if (!using_vc4_simulator) - ret = drmIoctl(screen->fd, DRM_IOCTL_VC4_WAIT_SEQNO, &wait); - else { - wait.seqno = screen->finished_seqno; - ret = 0; + if (unlikely(vc4_debug & VC4_DEBUG_PERF) && timeout_ns && reason) { + if (vc4_wait_seqno_ioctl(screen->fd, seqno, 0) 
== -ETIME) { + fprintf(stderr, "Blocking on seqno %lld for %s\n", + (long long)seqno, reason); + } } - if (ret == 0) { - screen->finished_seqno = wait.seqno; - return true; - } + int ret = vc4_wait_seqno_ioctl(screen->fd, seqno, timeout_ns); + if (ret) { + if (ret != -ETIME) { + fprintf(stderr, "wait failed: %d\n", ret); + abort(); + } - if (errno != ETIME) { - fprintf(stderr, "wait failed: %d\n", ret); - abort(); + return false; } - return false; + screen->finished_seqno = seqno; + return true; +} + +static int vc4_wait_bo_ioctl(int fd, uint32_t handle, uint64_t timeout_ns) +{ + if (using_vc4_simulator) + return 0; + + struct drm_vc4_wait_bo wait = { + .handle = handle, + .timeout_ns = timeout_ns, + }; + int ret = drmIoctl(fd, DRM_IOCTL_VC4_WAIT_BO, &wait); + if (ret == -1) + return -errno; + else + return 0; + } bool -vc4_bo_wait(struct vc4_bo *bo, uint64_t timeout_ns) +vc4_bo_wait(struct vc4_bo *bo, uint64_t timeout_ns, const char *reason) { struct vc4_screen *screen = bo->screen; - struct drm_vc4_wait_bo wait; - memset(&wait, 0, sizeof(wait)); - wait.handle = bo->handle; - wait.timeout_ns = timeout_ns; - - int ret; - if (!using_vc4_simulator) - ret = drmIoctl(screen->fd, DRM_IOCTL_VC4_WAIT_BO, &wait); - else - ret = 0; + if (unlikely(vc4_debug & VC4_DEBUG_PERF) && timeout_ns && reason) { + if (vc4_wait_bo_ioctl(screen->fd, bo->handle, 0) == -ETIME) { + fprintf(stderr, "Blocking on %s BO for %s\n", + bo->name, reason); + } + } - if (ret == 0) - return true; + int ret = vc4_wait_bo_ioctl(screen->fd, bo->handle, timeout_ns); + if (ret) { + if (ret != -ETIME) { + fprintf(stderr, "wait failed: %d\n", ret); + abort(); + } - if (errno != ETIME) { - fprintf(stderr, "wait failed: %d\n", ret); - abort(); + return false; } - return false; + return true; } void * @@ -515,7 +585,7 @@ vc4_bo_map(struct vc4_bo *bo) { void *map = vc4_bo_map_unsynchronized(bo); - bool ok = vc4_bo_wait(bo, PIPE_TIMEOUT_INFINITE); + bool ok = vc4_bo_wait(bo, PIPE_TIMEOUT_INFINITE, "bo map"); if 
(!ok) { fprintf(stderr, "BO wait for map failed\n"); abort(); diff --git a/src/gallium/drivers/vc4/vc4_bufmgr.h b/src/gallium/drivers/vc4/vc4_bufmgr.h index 7320695ca8e..b77506e242a 100644 --- a/src/gallium/drivers/vc4/vc4_bufmgr.h +++ b/src/gallium/drivers/vc4/vc4_bufmgr.h @@ -58,8 +58,8 @@ struct vc4_bo { struct vc4_bo *vc4_bo_alloc(struct vc4_screen *screen, uint32_t size, const char *name); -struct vc4_bo *vc4_bo_alloc_mem(struct vc4_screen *screen, const void *data, - uint32_t size, const char *name); +struct vc4_bo *vc4_bo_alloc_shader(struct vc4_screen *screen, const void *data, + uint32_t size); void vc4_bo_last_unreference(struct vc4_bo *bo); void vc4_bo_last_unreference_locked_timed(struct vc4_bo *bo, time_t time); struct vc4_bo *vc4_bo_open_name(struct vc4_screen *screen, uint32_t name, @@ -113,10 +113,11 @@ void * vc4_bo_map_unsynchronized(struct vc4_bo *bo); bool -vc4_bo_wait(struct vc4_bo *bo, uint64_t timeout_ns); +vc4_bo_wait(struct vc4_bo *bo, uint64_t timeout_ns, const char *reason); bool -vc4_wait_seqno(struct vc4_screen *screen, uint64_t seqno, uint64_t timeout_ns); +vc4_wait_seqno(struct vc4_screen *screen, uint64_t seqno, uint64_t timeout_ns, + const char *reason); void vc4_bufmgr_destroy(struct pipe_screen *pscreen); diff --git a/src/gallium/drivers/vc4/vc4_cl.c b/src/gallium/drivers/vc4/vc4_cl.c index 0700e885cbf..ced4f2dfa86 100644 --- a/src/gallium/drivers/vc4/vc4_cl.c +++ b/src/gallium/drivers/vc4/vc4_cl.c @@ -36,11 +36,12 @@ vc4_init_cl(struct vc4_context *vc4, struct vc4_cl *cl) void cl_ensure_space(struct vc4_cl *cl, uint32_t space) { - if ((cl->next - cl->base) + space <= cl->size) + uint32_t offset = cl_offset(cl); + + if (offset + space <= cl->size) return; uint32_t size = MAX2(cl->size + space, cl->size * 2); - uint32_t offset = cl->next -cl->base; cl->base = reralloc(ralloc_parent(cl->base), cl->base, uint8_t, size); cl->size = size; @@ -60,15 +61,20 @@ vc4_gem_hindex(struct vc4_context *vc4, struct vc4_bo *bo) uint32_t hindex; 
uint32_t *current_handles = vc4->bo_handles.base; - for (hindex = 0; - hindex < (vc4->bo_handles.next - vc4->bo_handles.base) / 4; - hindex++) { + for (hindex = 0; hindex < cl_offset(&vc4->bo_handles) / 4; hindex++) { if (current_handles[hindex] == bo->handle) return hindex; } - cl_u32(&vc4->bo_handles, bo->handle); - cl_ptr(&vc4->bo_pointers, vc4_bo_reference(bo)); + struct vc4_cl_out *out; + + out = cl_start(&vc4->bo_handles); + cl_u32(&out, bo->handle); + cl_end(&vc4->bo_handles, out); + + out = cl_start(&vc4->bo_pointers); + cl_ptr(&out, vc4_bo_reference(bo)); + cl_end(&vc4->bo_pointers, out); return hindex; } diff --git a/src/gallium/drivers/vc4/vc4_cl.h b/src/gallium/drivers/vc4/vc4_cl.h index 4a50e790942..bf4be0efc29 100644 --- a/src/gallium/drivers/vc4/vc4_cl.h +++ b/src/gallium/drivers/vc4/vc4_cl.h @@ -33,12 +33,20 @@ struct vc4_bo; +/** + * Undefined structure, used for typechecking that you're passing the pointers + * to these functions correctly. + */ +struct vc4_cl_out; + struct vc4_cl { void *base; - void *next; + struct vc4_cl_out *next; + struct vc4_cl_out *reloc_next; uint32_t size; - uint32_t reloc_next; +#ifdef DEBUG uint32_t reloc_count; +#endif }; void vc4_init_cl(struct vc4_context *vc4, struct vc4_cl *cl); @@ -49,135 +57,149 @@ uint32_t vc4_gem_hindex(struct vc4_context *vc4, struct vc4_bo *bo); struct PACKED unaligned_16 { uint16_t x; }; struct PACKED unaligned_32 { uint32_t x; }; -static inline void -put_unaligned_32(void *ptr, uint32_t val) +static inline uint32_t cl_offset(struct vc4_cl *cl) { - struct unaligned_32 *p = ptr; - p->x = val; + return (char *)cl->next - (char *)cl->base; } static inline void -put_unaligned_16(void *ptr, uint16_t val) +cl_advance(struct vc4_cl_out **cl, uint32_t n) { - struct unaligned_16 *p = ptr; - p->x = val; + (*cl) = (struct vc4_cl_out *)((char *)(*cl) + n); } -static inline void -cl_u8(struct vc4_cl *cl, uint8_t n) +static inline struct vc4_cl_out * +cl_start(struct vc4_cl *cl) { - assert((cl->next - 
cl->base) + 1 <= cl->size); - - *(uint8_t *)cl->next = n; - cl->next++; + return cl->next; } static inline void -cl_u16(struct vc4_cl *cl, uint16_t n) +cl_end(struct vc4_cl *cl, struct vc4_cl_out *next) { - assert((cl->next - cl->base) + 2 <= cl->size); + cl->next = next; + assert(cl_offset(cl) <= cl->size); +} - put_unaligned_16(cl->next, n); - cl->next += 2; + +static inline void +put_unaligned_32(struct vc4_cl_out *ptr, uint32_t val) +{ + struct unaligned_32 *p = (void *)ptr; + p->x = val; } static inline void -cl_u32(struct vc4_cl *cl, uint32_t n) +put_unaligned_16(struct vc4_cl_out *ptr, uint16_t val) { - assert((cl->next - cl->base) + 4 <= cl->size); + struct unaligned_16 *p = (void *)ptr; + p->x = val; +} - put_unaligned_32(cl->next, n); - cl->next += 4; +static inline void +cl_u8(struct vc4_cl_out **cl, uint8_t n) +{ + *(uint8_t *)(*cl) = n; + cl_advance(cl, 1); } static inline void -cl_aligned_u32(struct vc4_cl *cl, uint32_t n) +cl_u16(struct vc4_cl_out **cl, uint16_t n) { - assert((cl->next - cl->base) + 4 <= cl->size); + put_unaligned_16(*cl, n); + cl_advance(cl, 2); +} - *(uint32_t *)cl->next = n; - cl->next += 4; +static inline void +cl_u32(struct vc4_cl_out **cl, uint32_t n) +{ + put_unaligned_32(*cl, n); + cl_advance(cl, 4); } static inline void -cl_ptr(struct vc4_cl *cl, void *ptr) +cl_aligned_u32(struct vc4_cl_out **cl, uint32_t n) { - assert((cl->next - cl->base) + sizeof(void *) <= cl->size); + *(uint32_t *)(*cl) = n; + cl_advance(cl, 4); +} - *(void **)cl->next = ptr; - cl->next += sizeof(void *); +static inline void +cl_ptr(struct vc4_cl_out **cl, void *ptr) +{ + *(struct vc4_cl_out **)(*cl) = ptr; + cl_advance(cl, sizeof(void *)); } static inline void -cl_f(struct vc4_cl *cl, float f) +cl_f(struct vc4_cl_out **cl, float f) { cl_u32(cl, fui(f)); } static inline void -cl_aligned_f(struct vc4_cl *cl, float f) +cl_aligned_f(struct vc4_cl_out **cl, float f) { cl_aligned_u32(cl, fui(f)); } static inline void -cl_start_reloc(struct vc4_cl *cl, 
uint32_t n) +cl_start_reloc(struct vc4_cl *cl, struct vc4_cl_out **out, uint32_t n) { assert(n == 1 || n == 2); +#ifdef DEBUG assert(cl->reloc_count == 0); cl->reloc_count = n; +#endif - cl_u8(cl, VC4_PACKET_GEM_HANDLES); - cl->reloc_next = cl->next - cl->base; - cl_u32(cl, 0); /* Space where hindex will be written. */ - cl_u32(cl, 0); /* Space where hindex will be written. */ + cl_u8(out, VC4_PACKET_GEM_HANDLES); + cl->reloc_next = *out; + cl_u32(out, 0); /* Space where hindex will be written. */ + cl_u32(out, 0); /* Space where hindex will be written. */ } -static inline void +static inline struct vc4_cl_out * cl_start_shader_reloc(struct vc4_cl *cl, uint32_t n) { +#ifdef DEBUG assert(cl->reloc_count == 0); cl->reloc_count = n; - cl->reloc_next = cl->next - cl->base; +#endif + cl->reloc_next = cl->next; + + /* Reserve the space where hindex will be written. */ + cl_advance(&cl->next, n * 4); - /* Space where hindex will be written. */ - cl->next += n * 4; + return cl->next; } static inline void -cl_reloc_hindex(struct vc4_cl *cl, uint32_t hindex, uint32_t offset) +cl_reloc(struct vc4_context *vc4, struct vc4_cl *cl, struct vc4_cl_out **cl_out, + struct vc4_bo *bo, uint32_t offset) { - *(uint32_t *)(cl->base + cl->reloc_next) = hindex; - cl->reloc_next += 4; + *(uint32_t *)cl->reloc_next = vc4_gem_hindex(vc4, bo); + cl_advance(&cl->reloc_next, 4); +#ifdef DEBUG cl->reloc_count--; +#endif - cl_u32(cl, offset); + cl_u32(cl_out, offset); } static inline void -cl_aligned_reloc_hindex(struct vc4_cl *cl, uint32_t hindex, uint32_t offset) +cl_aligned_reloc(struct vc4_context *vc4, struct vc4_cl *cl, + struct vc4_cl_out **cl_out, + struct vc4_bo *bo, uint32_t offset) { - *(uint32_t *)(cl->base + cl->reloc_next) = hindex; - cl->reloc_next += 4; + *(uint32_t *)cl->reloc_next = vc4_gem_hindex(vc4, bo); + cl_advance(&cl->reloc_next, 4); +#ifdef DEBUG cl->reloc_count--; +#endif - cl_aligned_u32(cl, offset); -} - -static inline void -cl_reloc(struct vc4_context *vc4, struct 
vc4_cl *cl, - struct vc4_bo *bo, uint32_t offset) -{ - cl_reloc_hindex(cl, vc4_gem_hindex(vc4, bo), offset); -} - -static inline void -cl_aligned_reloc(struct vc4_context *vc4, struct vc4_cl *cl, - struct vc4_bo *bo, uint32_t offset) -{ - cl_aligned_reloc_hindex(cl, vc4_gem_hindex(vc4, bo), offset); + cl_aligned_u32(cl_out, offset); } void cl_ensure_space(struct vc4_cl *cl, uint32_t size); diff --git a/src/gallium/drivers/vc4/vc4_cl_dump.c b/src/gallium/drivers/vc4/vc4_cl_dump.c index 69055081daa..6d748010baf 100644 --- a/src/gallium/drivers/vc4/vc4_cl_dump.c +++ b/src/gallium/drivers/vc4/vc4_cl_dump.c @@ -34,7 +34,7 @@ dump_float(void *cl, uint32_t offset, uint32_t hw_offset) void *f = cl + offset; fprintf(stderr, "0x%08x 0x%08x: %f (0x%08x)\n", - offset, hw_offset, *(float *)f, *(uint32_t *)f); + offset, hw_offset, uif(*(uint32_t *)f), *(uint32_t *)f); } static void @@ -47,7 +47,33 @@ dump_VC4_PACKET_BRANCH_TO_SUB_LIST(void *cl, uint32_t offset, uint32_t hw_offset } static void -dump_VC4_PACKET_STORE_TILE_BUFFER_GENERAL(void *cl, uint32_t offset, uint32_t hw_offset) +dump_loadstore_full(void *cl, uint32_t offset, uint32_t hw_offset) +{ + uint32_t bits = *(uint32_t *)(cl + offset); + + fprintf(stderr, "0x%08x 0x%08x: addr 0x%08x%s%s%s%s\n", + offset, hw_offset, + bits & ~0xf, + (bits & VC4_LOADSTORE_FULL_RES_DISABLE_CLEAR_ALL) ? "" : " clear", + (bits & VC4_LOADSTORE_FULL_RES_DISABLE_ZS) ? "" : " zs", + (bits & VC4_LOADSTORE_FULL_RES_DISABLE_COLOR) ? "" : " color", + (bits & VC4_LOADSTORE_FULL_RES_EOF) ? 
" eof" : ""); +} + +static void +dump_VC4_PACKET_LOAD_FULL_RES_TILE_BUFFER(void *cl, uint32_t offset, uint32_t hw_offset) +{ + dump_loadstore_full(cl, offset, hw_offset); +} + +static void +dump_VC4_PACKET_STORE_FULL_RES_TILE_BUFFER(void *cl, uint32_t offset, uint32_t hw_offset) +{ + dump_loadstore_full(cl, offset, hw_offset); +} + +static void +dump_loadstore_general(void *cl, uint32_t offset, uint32_t hw_offset) { uint8_t *bytes = cl + offset; uint32_t *addr = cl + offset + 2; @@ -125,6 +151,18 @@ dump_VC4_PACKET_STORE_TILE_BUFFER_GENERAL(void *cl, uint32_t offset, uint32_t hw } static void +dump_VC4_PACKET_STORE_TILE_BUFFER_GENERAL(void *cl, uint32_t offset, uint32_t hw_offset) +{ + dump_loadstore_general(cl, offset, hw_offset); +} + +static void +dump_VC4_PACKET_LOAD_TILE_BUFFER_GENERAL(void *cl, uint32_t offset, uint32_t hw_offset) +{ + dump_loadstore_general(cl, offset, hw_offset); +} + +static void dump_VC4_PACKET_FLAT_SHADE_FLAGS(void *cl, uint32_t offset, uint32_t hw_offset) { uint32_t *bits = cl + offset; @@ -291,63 +329,63 @@ dump_VC4_PACKET_GEM_HANDLES(void *cl, uint32_t offset, uint32_t hw_offset) offset, hw_offset, handles[0], handles[1]); } -#define PACKET_DUMP(name, size) [name] = { #name, size, dump_##name } -#define PACKET(name, size) [name] = { #name, size, NULL } +#define PACKET_DUMP(name) [name] = { #name, name ## _SIZE, dump_##name } +#define PACKET(name) [name] = { #name, name ## _SIZE, NULL } static const struct packet_info { const char *name; uint8_t size; void (*dump_func)(void *cl, uint32_t offset, uint32_t hw_offset); } packet_info[] = { - PACKET(VC4_PACKET_HALT, 1), - PACKET(VC4_PACKET_NOP, 1), - - PACKET(VC4_PACKET_FLUSH, 1), - PACKET(VC4_PACKET_FLUSH_ALL, 1), - PACKET(VC4_PACKET_START_TILE_BINNING, 1), - PACKET(VC4_PACKET_INCREMENT_SEMAPHORE, 1), - PACKET(VC4_PACKET_WAIT_ON_SEMAPHORE, 1), - - PACKET(VC4_PACKET_BRANCH, 5), - PACKET_DUMP(VC4_PACKET_BRANCH_TO_SUB_LIST, 5), - - PACKET(VC4_PACKET_STORE_MS_TILE_BUFFER, 1), - 
PACKET(VC4_PACKET_STORE_MS_TILE_BUFFER_AND_EOF, 1), - PACKET(VC4_PACKET_STORE_FULL_RES_TILE_BUFFER, 5), - PACKET(VC4_PACKET_LOAD_FULL_RES_TILE_BUFFER, 5), - PACKET_DUMP(VC4_PACKET_STORE_TILE_BUFFER_GENERAL, 7), - PACKET(VC4_PACKET_LOAD_TILE_BUFFER_GENERAL, 7), - - PACKET(VC4_PACKET_GL_INDEXED_PRIMITIVE, 14), - PACKET(VC4_PACKET_GL_ARRAY_PRIMITIVE, 10), - - PACKET(VC4_PACKET_COMPRESSED_PRIMITIVE, 48), - PACKET(VC4_PACKET_CLIPPED_COMPRESSED_PRIMITIVE, 49), - - PACKET(VC4_PACKET_PRIMITIVE_LIST_FORMAT, 2), - - PACKET(VC4_PACKET_GL_SHADER_STATE, 5), - PACKET(VC4_PACKET_NV_SHADER_STATE, 5), - PACKET(VC4_PACKET_VG_SHADER_STATE, 5), - - PACKET(VC4_PACKET_CONFIGURATION_BITS, 4), - PACKET_DUMP(VC4_PACKET_FLAT_SHADE_FLAGS, 5), - PACKET_DUMP(VC4_PACKET_POINT_SIZE, 5), - PACKET_DUMP(VC4_PACKET_LINE_WIDTH, 5), - PACKET(VC4_PACKET_RHT_X_BOUNDARY, 3), - PACKET(VC4_PACKET_DEPTH_OFFSET, 5), - PACKET(VC4_PACKET_CLIP_WINDOW, 9), - PACKET_DUMP(VC4_PACKET_VIEWPORT_OFFSET, 5), - PACKET(VC4_PACKET_Z_CLIPPING, 9), - PACKET_DUMP(VC4_PACKET_CLIPPER_XY_SCALING, 9), - PACKET_DUMP(VC4_PACKET_CLIPPER_Z_SCALING, 9), - - PACKET_DUMP(VC4_PACKET_TILE_BINNING_MODE_CONFIG, 16), - PACKET_DUMP(VC4_PACKET_TILE_RENDERING_MODE_CONFIG, 11), - PACKET(VC4_PACKET_CLEAR_COLORS, 14), - PACKET_DUMP(VC4_PACKET_TILE_COORDINATES, 3), - - PACKET_DUMP(VC4_PACKET_GEM_HANDLES, 9), + PACKET(VC4_PACKET_HALT), + PACKET(VC4_PACKET_NOP), + + PACKET(VC4_PACKET_FLUSH), + PACKET(VC4_PACKET_FLUSH_ALL), + PACKET(VC4_PACKET_START_TILE_BINNING), + PACKET(VC4_PACKET_INCREMENT_SEMAPHORE), + PACKET(VC4_PACKET_WAIT_ON_SEMAPHORE), + + PACKET(VC4_PACKET_BRANCH), + PACKET_DUMP(VC4_PACKET_BRANCH_TO_SUB_LIST), + + PACKET(VC4_PACKET_STORE_MS_TILE_BUFFER), + PACKET(VC4_PACKET_STORE_MS_TILE_BUFFER_AND_EOF), + PACKET_DUMP(VC4_PACKET_STORE_FULL_RES_TILE_BUFFER), + PACKET_DUMP(VC4_PACKET_LOAD_FULL_RES_TILE_BUFFER), + PACKET_DUMP(VC4_PACKET_STORE_TILE_BUFFER_GENERAL), + PACKET_DUMP(VC4_PACKET_LOAD_TILE_BUFFER_GENERAL), + + 
PACKET(VC4_PACKET_GL_INDEXED_PRIMITIVE), + PACKET(VC4_PACKET_GL_ARRAY_PRIMITIVE), + + PACKET(VC4_PACKET_COMPRESSED_PRIMITIVE), + PACKET(VC4_PACKET_CLIPPED_COMPRESSED_PRIMITIVE), + + PACKET(VC4_PACKET_PRIMITIVE_LIST_FORMAT), + + PACKET(VC4_PACKET_GL_SHADER_STATE), + PACKET(VC4_PACKET_NV_SHADER_STATE), + PACKET(VC4_PACKET_VG_SHADER_STATE), + + PACKET(VC4_PACKET_CONFIGURATION_BITS), + PACKET_DUMP(VC4_PACKET_FLAT_SHADE_FLAGS), + PACKET_DUMP(VC4_PACKET_POINT_SIZE), + PACKET_DUMP(VC4_PACKET_LINE_WIDTH), + PACKET(VC4_PACKET_RHT_X_BOUNDARY), + PACKET(VC4_PACKET_DEPTH_OFFSET), + PACKET(VC4_PACKET_CLIP_WINDOW), + PACKET_DUMP(VC4_PACKET_VIEWPORT_OFFSET), + PACKET(VC4_PACKET_Z_CLIPPING), + PACKET_DUMP(VC4_PACKET_CLIPPER_XY_SCALING), + PACKET_DUMP(VC4_PACKET_CLIPPER_Z_SCALING), + + PACKET_DUMP(VC4_PACKET_TILE_BINNING_MODE_CONFIG), + PACKET_DUMP(VC4_PACKET_TILE_RENDERING_MODE_CONFIG), + PACKET(VC4_PACKET_CLEAR_COLORS), + PACKET_DUMP(VC4_PACKET_TILE_COORDINATES), + + PACKET_DUMP(VC4_PACKET_GEM_HANDLES), }; void @@ -359,7 +397,7 @@ vc4_dump_cl(void *cl, uint32_t size, bool is_render) while (offset < size) { uint8_t header = cmds[offset]; - if (header > ARRAY_SIZE(packet_info) || + if (header >= ARRAY_SIZE(packet_info) || !packet_info[header].name) { fprintf(stderr, "0x%08x 0x%08x: Unknown packet 0x%02x (%d)!\n", offset, hw_offset, header, header); diff --git a/src/gallium/drivers/vc4/vc4_context.c b/src/gallium/drivers/vc4/vc4_context.c index 630f8e68896..fff63158c9d 100644 --- a/src/gallium/drivers/vc4/vc4_context.c +++ b/src/gallium/drivers/vc4/vc4_context.c @@ -61,9 +61,11 @@ vc4_flush(struct pipe_context *pctx) * FLUSH completes. */ cl_ensure_space(&vc4->bcl, 8); - cl_u8(&vc4->bcl, VC4_PACKET_INCREMENT_SEMAPHORE); + struct vc4_cl_out *bcl = cl_start(&vc4->bcl); + cl_u8(&bcl, VC4_PACKET_INCREMENT_SEMAPHORE); /* The FLUSH caps all of our bin lists with a VC4_PACKET_RETURN. 
*/ - cl_u8(&vc4->bcl, VC4_PACKET_FLUSH); + cl_u8(&bcl, VC4_PACKET_FLUSH); + cl_end(&vc4->bcl, bcl); if (cbuf && (vc4->resolve & PIPE_CLEAR_COLOR0)) { pipe_surface_reference(&vc4->color_write, cbuf); @@ -103,8 +105,10 @@ vc4_pipe_flush(struct pipe_context *pctx, struct pipe_fence_handle **fence, vc4_flush(pctx); if (fence) { + struct pipe_screen *screen = pctx->screen; struct vc4_fence *f = vc4_fence_create(vc4->screen, vc4->last_emit_seqno); + screen->fence_reference(screen, fence, NULL); *fence = (struct pipe_fence_handle *)f; } } @@ -126,8 +130,7 @@ vc4_cl_references_bo(struct pipe_context *pctx, struct vc4_bo *bo) * they match. */ struct vc4_bo **referenced_bos = vc4->bo_pointers.base; - for (int i = 0; i < (vc4->bo_handles.next - - vc4->bo_handles.base) / 4; i++) { + for (int i = 0; i < cl_offset(&vc4->bo_handles) / 4; i++) { if (referenced_bos[i] == bo) { return true; } diff --git a/src/gallium/drivers/vc4/vc4_context.h b/src/gallium/drivers/vc4/vc4_context.h index d5d6be16f6e..654c46f3c0d 100644 --- a/src/gallium/drivers/vc4/vc4_context.h +++ b/src/gallium/drivers/vc4/vc4_context.h @@ -67,7 +67,20 @@ #define VC4_DIRTY_CLIP (1 << 20) #define VC4_DIRTY_UNCOMPILED_VS (1 << 21) #define VC4_DIRTY_UNCOMPILED_FS (1 << 22) -#define VC4_DIRTY_COMPILED_FS (1 << 24) +#define VC4_DIRTY_COMPILED_CS (1 << 23) +#define VC4_DIRTY_COMPILED_VS (1 << 24) +#define VC4_DIRTY_COMPILED_FS (1 << 25) + +struct vc4_sampler_view { + struct pipe_sampler_view base; + uint32_t texture_p0; + uint32_t texture_p1; +}; + +struct vc4_sampler_state { + struct pipe_sampler_state base; + uint32_t texture_p1; +}; struct vc4_texture_stateobj { struct pipe_sampler_view *textures[PIPE_MAX_SAMPLERS]; @@ -121,6 +134,12 @@ struct vc4_compiled_shader { struct vc4_ubo_range *ubo_ranges; uint32_t num_ubo_ranges; uint32_t ubo_size; + /** + * VC4_DIRTY_* flags that, when set in vc4->dirty, mean that the + * uniforms have to be rewritten (and therefore the shader state + * reemitted). 
+ */ + uint32_t uniform_dirty_bits; /** bitmask of which inputs are color inputs, for flat shade handling. */ uint32_t color_inputs; @@ -238,6 +257,11 @@ struct vc4_context { */ bool draw_call_queued; + /** Maximum index buffer valid for the current shader_rec. */ + uint32_t max_index; + /** Last index bias baked into the current shader_rec. */ + uint32_t last_index_bias; + struct primconvert_context *primconvert; struct hash_table *fs_cache, *vs_cache; @@ -246,6 +270,7 @@ struct vc4_context { struct ra_regs *regs; unsigned int reg_class_any; + unsigned int reg_class_r4_or_a; unsigned int reg_class_a; uint8_t prim_mode; @@ -326,6 +351,18 @@ vc4_context(struct pipe_context *pcontext) return (struct vc4_context *)pcontext; } +static inline struct vc4_sampler_view * +vc4_sampler_view(struct pipe_sampler_view *psview) +{ + return (struct vc4_sampler_view *)psview; +} + +static inline struct vc4_sampler_state * +vc4_sampler_state(struct pipe_sampler_state *psampler) +{ + return (struct vc4_sampler_state *)psampler; +} + struct pipe_context *vc4_context_create(struct pipe_screen *pscreen, void *priv); void vc4_draw_init(struct pipe_context *pctx); @@ -337,6 +374,7 @@ void vc4_simulator_init(struct vc4_screen *screen); int vc4_simulator_flush(struct vc4_context *vc4, struct drm_vc4_submit_cl *args); +void vc4_set_shader_uniform_dirty_flags(struct vc4_compiled_shader *shader); void vc4_write_uniforms(struct vc4_context *vc4, struct vc4_compiled_shader *shader, struct vc4_constbuf_stateobj *cb, diff --git a/src/gallium/drivers/vc4/vc4_draw.c b/src/gallium/drivers/vc4/vc4_draw.c index 5e6d70d6f33..a4e5e092b1a 100644 --- a/src/gallium/drivers/vc4/vc4_draw.c +++ b/src/gallium/drivers/vc4/vc4_draw.c @@ -71,37 +71,40 @@ vc4_start_draw(struct vc4_context *vc4) uint32_t height = vc4->framebuffer.height; uint32_t tilew = align(width, 64) / 64; uint32_t tileh = align(height, 64) / 64; + struct vc4_cl_out *bcl = cl_start(&vc4->bcl); // Tile state data is 48 bytes per tile, I think it 
can be thrown away // as soon as binning is finished. - cl_u8(&vc4->bcl, VC4_PACKET_TILE_BINNING_MODE_CONFIG); - cl_u32(&vc4->bcl, 0); /* tile alloc addr, filled by kernel */ - cl_u32(&vc4->bcl, 0); /* tile alloc size, filled by kernel */ - cl_u32(&vc4->bcl, 0); /* tile state addr, filled by kernel */ - cl_u8(&vc4->bcl, tilew); - cl_u8(&vc4->bcl, tileh); - cl_u8(&vc4->bcl, 0); /* flags, filled by kernel. */ + cl_u8(&bcl, VC4_PACKET_TILE_BINNING_MODE_CONFIG); + cl_u32(&bcl, 0); /* tile alloc addr, filled by kernel */ + cl_u32(&bcl, 0); /* tile alloc size, filled by kernel */ + cl_u32(&bcl, 0); /* tile state addr, filled by kernel */ + cl_u8(&bcl, tilew); + cl_u8(&bcl, tileh); + cl_u8(&bcl, 0); /* flags, filled by kernel. */ /* START_TILE_BINNING resets the statechange counters in the hardware, * which are what is used when a primitive is binned to a tile to * figure out what new state packets need to be written to that tile's * command list. */ - cl_u8(&vc4->bcl, VC4_PACKET_START_TILE_BINNING); + cl_u8(&bcl, VC4_PACKET_START_TILE_BINNING); /* Reset the current compressed primitives format. This gets modified * by VC4_PACKET_GL_INDEXED_PRIMITIVE and * VC4_PACKET_GL_ARRAY_PRIMITIVE, so it needs to be reset at the start * of every tile. 
*/ - cl_u8(&vc4->bcl, VC4_PACKET_PRIMITIVE_LIST_FORMAT); - cl_u8(&vc4->bcl, (VC4_PRIMITIVE_LIST_FORMAT_16_INDEX | - VC4_PRIMITIVE_LIST_FORMAT_TYPE_TRIANGLES)); + cl_u8(&bcl, VC4_PACKET_PRIMITIVE_LIST_FORMAT); + cl_u8(&bcl, (VC4_PRIMITIVE_LIST_FORMAT_16_INDEX | + VC4_PRIMITIVE_LIST_FORMAT_TYPE_TRIANGLES)); vc4->needs_flush = true; vc4->draw_call_queued = true; vc4->draw_width = width; vc4->draw_height = height; + + cl_end(&vc4->bcl, bcl); } static void @@ -119,96 +122,67 @@ vc4_update_shadow_textures(struct pipe_context *pctx, } static void -vc4_draw_vbo(struct pipe_context *pctx, const struct pipe_draw_info *info) +vc4_emit_gl_shader_state(struct vc4_context *vc4, const struct pipe_draw_info *info) { - struct vc4_context *vc4 = vc4_context(pctx); - - if (info->mode >= PIPE_PRIM_QUADS) { - util_primconvert_save_index_buffer(vc4->primconvert, &vc4->indexbuf); - util_primconvert_save_rasterizer_state(vc4->primconvert, &vc4->rasterizer->base); - util_primconvert_draw_vbo(vc4->primconvert, info); - perf_debug("Fallback conversion for %d %s vertices\n", - info->count, u_prim_name(info->mode)); - return; - } - - /* Before setting up the draw, do any fixup blits necessary. 
*/ - vc4_update_shadow_textures(pctx, &vc4->verttex); - vc4_update_shadow_textures(pctx, &vc4->fragtex); - - vc4_get_draw_cl_space(vc4); - + /* VC4_DIRTY_VTXSTATE */ struct vc4_vertex_stateobj *vtx = vc4->vtx; + /* VC4_DIRTY_VTXBUF */ struct vc4_vertexbuf_stateobj *vertexbuf = &vc4->vertexbuf; - if (vc4->prim_mode != info->mode) { - vc4->prim_mode = info->mode; - vc4->dirty |= VC4_DIRTY_PRIM_MODE; - } - - vc4_start_draw(vc4); - vc4_update_compiled_shaders(vc4, info->mode); - - vc4_emit_state(pctx); - vc4->dirty = 0; - - vc4_write_uniforms(vc4, vc4->prog.fs, - &vc4->constbuf[PIPE_SHADER_FRAGMENT], - &vc4->fragtex); - vc4_write_uniforms(vc4, vc4->prog.vs, - &vc4->constbuf[PIPE_SHADER_VERTEX], - &vc4->verttex); - vc4_write_uniforms(vc4, vc4->prog.cs, - &vc4->constbuf[PIPE_SHADER_VERTEX], - &vc4->verttex); - /* The simulator throws a fit if VS or CS don't read an attribute, so * we emit a dummy read. */ uint32_t num_elements_emit = MAX2(vtx->num_elements, 1); /* Emit the shader record. */ - cl_start_shader_reloc(&vc4->shader_rec, 3 + num_elements_emit); - cl_u16(&vc4->shader_rec, + struct vc4_cl_out *shader_rec = + cl_start_shader_reloc(&vc4->shader_rec, 3 + num_elements_emit); + /* VC4_DIRTY_PRIM_MODE | VC4_DIRTY_RASTERIZER */ + cl_u16(&shader_rec, VC4_SHADER_FLAG_ENABLE_CLIPPING | + VC4_SHADER_FLAG_FS_SINGLE_THREAD | ((info->mode == PIPE_PRIM_POINTS && vc4->rasterizer->base.point_size_per_vertex) ? 
VC4_SHADER_FLAG_VS_POINT_SIZE : 0)); - cl_u8(&vc4->shader_rec, 0); /* fs num uniforms (unused) */ - cl_u8(&vc4->shader_rec, vc4->prog.fs->num_inputs); - cl_reloc(vc4, &vc4->shader_rec, vc4->prog.fs->bo, 0); - cl_u32(&vc4->shader_rec, 0); /* UBO offset written by kernel */ - - cl_u16(&vc4->shader_rec, 0); /* vs num uniforms */ - cl_u8(&vc4->shader_rec, vc4->prog.vs->vattrs_live); - cl_u8(&vc4->shader_rec, vc4->prog.vs->vattr_offsets[8]); - cl_reloc(vc4, &vc4->shader_rec, vc4->prog.vs->bo, 0); - cl_u32(&vc4->shader_rec, 0); /* UBO offset written by kernel */ - - cl_u16(&vc4->shader_rec, 0); /* cs num uniforms */ - cl_u8(&vc4->shader_rec, vc4->prog.cs->vattrs_live); - cl_u8(&vc4->shader_rec, vc4->prog.cs->vattr_offsets[8]); - cl_reloc(vc4, &vc4->shader_rec, vc4->prog.cs->bo, 0); - cl_u32(&vc4->shader_rec, 0); /* UBO offset written by kernel */ + + /* VC4_DIRTY_COMPILED_FS */ + cl_u8(&shader_rec, 0); /* fs num uniforms (unused) */ + cl_u8(&shader_rec, vc4->prog.fs->num_inputs); + cl_reloc(vc4, &vc4->shader_rec, &shader_rec, vc4->prog.fs->bo, 0); + cl_u32(&shader_rec, 0); /* UBO offset written by kernel */ + + /* VC4_DIRTY_COMPILED_VS */ + cl_u16(&shader_rec, 0); /* vs num uniforms */ + cl_u8(&shader_rec, vc4->prog.vs->vattrs_live); + cl_u8(&shader_rec, vc4->prog.vs->vattr_offsets[8]); + cl_reloc(vc4, &vc4->shader_rec, &shader_rec, vc4->prog.vs->bo, 0); + cl_u32(&shader_rec, 0); /* UBO offset written by kernel */ + + /* VC4_DIRTY_COMPILED_CS */ + cl_u16(&shader_rec, 0); /* cs num uniforms */ + cl_u8(&shader_rec, vc4->prog.cs->vattrs_live); + cl_u8(&shader_rec, vc4->prog.cs->vattr_offsets[8]); + cl_reloc(vc4, &vc4->shader_rec, &shader_rec, vc4->prog.cs->bo, 0); + cl_u32(&shader_rec, 0); /* UBO offset written by kernel */ uint32_t max_index = 0xffff; - uint32_t vpm_offset = 0; for (int i = 0; i < vtx->num_elements; i++) { struct pipe_vertex_element *elem = &vtx->pipe[i]; struct pipe_vertex_buffer *vb = &vertexbuf->vb[elem->vertex_buffer_index]; struct vc4_resource *rsc = 
vc4_resource(vb->buffer); - uint32_t offset = vb->buffer_offset + elem->src_offset; + /* not vc4->dirty tracked: vc4->last_index_bias */ + uint32_t offset = (vb->buffer_offset + + elem->src_offset + + vb->stride * info->index_bias); uint32_t vb_size = rsc->bo->size - offset; uint32_t elem_size = util_format_get_blocksize(elem->src_format); - cl_reloc(vc4, &vc4->shader_rec, rsc->bo, offset); - cl_u8(&vc4->shader_rec, elem_size - 1); - cl_u8(&vc4->shader_rec, vb->stride); - cl_u8(&vc4->shader_rec, vc4->prog.vs->vattr_offsets[i]); - cl_u8(&vc4->shader_rec, vc4->prog.cs->vattr_offsets[i]); - - vpm_offset += align(elem_size, 4); + cl_reloc(vc4, &vc4->shader_rec, &shader_rec, rsc->bo, offset); + cl_u8(&shader_rec, elem_size - 1); + cl_u8(&shader_rec, vb->stride); + cl_u8(&shader_rec, vc4->prog.vs->vattr_offsets[i]); + cl_u8(&shader_rec, vc4->prog.cs->vattr_offsets[i]); if (vb->stride > 0) { max_index = MIN2(max_index, @@ -219,25 +193,89 @@ vc4_draw_vbo(struct pipe_context *pctx, const struct pipe_draw_info *info) if (vtx->num_elements == 0) { assert(num_elements_emit == 1); struct vc4_bo *bo = vc4_bo_alloc(vc4->screen, 4096, "scratch VBO"); - cl_reloc(vc4, &vc4->shader_rec, bo, 0); - cl_u8(&vc4->shader_rec, 16 - 1); /* element size */ - cl_u8(&vc4->shader_rec, 0); /* stride */ - cl_u8(&vc4->shader_rec, 0); /* VS VPM offset */ - cl_u8(&vc4->shader_rec, 0); /* CS VPM offset */ + cl_reloc(vc4, &vc4->shader_rec, &shader_rec, bo, 0); + cl_u8(&shader_rec, 16 - 1); /* element size */ + cl_u8(&shader_rec, 0); /* stride */ + cl_u8(&shader_rec, 0); /* VS VPM offset */ + cl_u8(&shader_rec, 0); /* CS VPM offset */ vc4_bo_unreference(&bo); } + cl_end(&vc4->shader_rec, shader_rec); + struct vc4_cl_out *bcl = cl_start(&vc4->bcl); /* the actual draw call. */ - cl_u8(&vc4->bcl, VC4_PACKET_GL_SHADER_STATE); + cl_u8(&bcl, VC4_PACKET_GL_SHADER_STATE); assert(vtx->num_elements <= 8); /* Note that number of attributes == 0 in the packet means 8 * attributes. 
This field also contains the offset into shader_rec. */ - cl_u32(&vc4->bcl, num_elements_emit & 0x7); + cl_u32(&bcl, num_elements_emit & 0x7); + cl_end(&vc4->bcl, bcl); + + vc4_write_uniforms(vc4, vc4->prog.fs, + &vc4->constbuf[PIPE_SHADER_FRAGMENT], + &vc4->fragtex); + vc4_write_uniforms(vc4, vc4->prog.vs, + &vc4->constbuf[PIPE_SHADER_VERTEX], + &vc4->verttex); + vc4_write_uniforms(vc4, vc4->prog.cs, + &vc4->constbuf[PIPE_SHADER_VERTEX], + &vc4->verttex); + + vc4->last_index_bias = info->index_bias; + vc4->max_index = max_index; +} + +static void +vc4_draw_vbo(struct pipe_context *pctx, const struct pipe_draw_info *info) +{ + struct vc4_context *vc4 = vc4_context(pctx); + + if (info->mode >= PIPE_PRIM_QUADS) { + util_primconvert_save_index_buffer(vc4->primconvert, &vc4->indexbuf); + util_primconvert_save_rasterizer_state(vc4->primconvert, &vc4->rasterizer->base); + util_primconvert_draw_vbo(vc4->primconvert, info); + perf_debug("Fallback conversion for %d %s vertices\n", + info->count, u_prim_name(info->mode)); + return; + } + + /* Before setting up the draw, do any fixup blits necessary. */ + vc4_update_shadow_textures(pctx, &vc4->verttex); + vc4_update_shadow_textures(pctx, &vc4->fragtex); + + vc4_get_draw_cl_space(vc4); + + if (vc4->prim_mode != info->mode) { + vc4->prim_mode = info->mode; + vc4->dirty |= VC4_DIRTY_PRIM_MODE; + } + + vc4_start_draw(vc4); + vc4_update_compiled_shaders(vc4, info->mode); + + vc4_emit_state(pctx); + + if ((vc4->dirty & (VC4_DIRTY_VTXBUF | + VC4_DIRTY_VTXSTATE | + VC4_DIRTY_PRIM_MODE | + VC4_DIRTY_RASTERIZER | + VC4_DIRTY_COMPILED_CS | + VC4_DIRTY_COMPILED_VS | + VC4_DIRTY_COMPILED_FS | + vc4->prog.cs->uniform_dirty_bits | + vc4->prog.vs->uniform_dirty_bits | + vc4->prog.fs->uniform_dirty_bits)) || + vc4->last_index_bias != info->index_bias) { + vc4_emit_gl_shader_state(vc4, info); + } + + vc4->dirty = 0; /* Note that the primitive type fields match with OpenGL/gallium * definitions, up to but not including QUADS. 
*/ + struct vc4_cl_out *bcl = cl_start(&vc4->bcl); if (info->indexed) { uint32_t offset = vc4->indexbuf.offset; uint32_t index_size = vc4->indexbuf.index_size; @@ -251,25 +289,26 @@ vc4_draw_vbo(struct pipe_context *pctx, const struct pipe_draw_info *info) } struct vc4_resource *rsc = vc4_resource(prsc); - cl_start_reloc(&vc4->bcl, 1); - cl_u8(&vc4->bcl, VC4_PACKET_GL_INDEXED_PRIMITIVE); - cl_u8(&vc4->bcl, + cl_start_reloc(&vc4->bcl, &bcl, 1); + cl_u8(&bcl, VC4_PACKET_GL_INDEXED_PRIMITIVE); + cl_u8(&bcl, info->mode | (index_size == 2 ? VC4_INDEX_BUFFER_U16: VC4_INDEX_BUFFER_U8)); - cl_u32(&vc4->bcl, info->count); - cl_reloc(vc4, &vc4->bcl, rsc->bo, offset); - cl_u32(&vc4->bcl, max_index); + cl_u32(&bcl, info->count); + cl_reloc(vc4, &vc4->bcl, &bcl, rsc->bo, offset); + cl_u32(&bcl, vc4->max_index); if (vc4->indexbuf.index_size == 4) pipe_resource_reference(&prsc, NULL); } else { - cl_u8(&vc4->bcl, VC4_PACKET_GL_ARRAY_PRIMITIVE); - cl_u8(&vc4->bcl, info->mode); - cl_u32(&vc4->bcl, info->count); - cl_u32(&vc4->bcl, info->start); + cl_u8(&bcl, VC4_PACKET_GL_ARRAY_PRIMITIVE); + cl_u8(&bcl, info->mode); + cl_u32(&bcl, info->count); + cl_u32(&bcl, info->start); } + cl_end(&vc4->bcl, bcl); if (vc4->zsa && vc4->zsa->base.depth.enabled) { vc4->resolve |= PIPE_CLEAR_DEPTH; diff --git a/src/gallium/drivers/vc4/vc4_drm.h b/src/gallium/drivers/vc4/vc4_drm.h index 5f1ee4fa125..863ef8da8fb 100644 --- a/src/gallium/drivers/vc4/vc4_drm.h +++ b/src/gallium/drivers/vc4/vc4_drm.h @@ -31,12 +31,14 @@ #define DRM_VC4_WAIT_BO 0x02 #define DRM_VC4_CREATE_BO 0x03 #define DRM_VC4_MMAP_BO 0x04 +#define DRM_VC4_CREATE_SHADER_BO 0x05 #define DRM_IOCTL_VC4_SUBMIT_CL DRM_IOWR( DRM_COMMAND_BASE + DRM_VC4_SUBMIT_CL, struct drm_vc4_submit_cl) #define DRM_IOCTL_VC4_WAIT_SEQNO DRM_IOWR( DRM_COMMAND_BASE + DRM_VC4_WAIT_SEQNO, struct drm_vc4_wait_seqno) #define DRM_IOCTL_VC4_WAIT_BO DRM_IOWR( DRM_COMMAND_BASE + DRM_VC4_WAIT_BO, struct drm_vc4_wait_bo) #define DRM_IOCTL_VC4_CREATE_BO DRM_IOWR( 
DRM_COMMAND_BASE + DRM_VC4_CREATE_BO, struct drm_vc4_create_bo) #define DRM_IOCTL_VC4_MMAP_BO DRM_IOWR( DRM_COMMAND_BASE + DRM_VC4_MMAP_BO, struct drm_vc4_mmap_bo) +#define DRM_IOCTL_VC4_CREATE_SHADER_BO DRM_IOWR( DRM_COMMAND_BASE + DRM_VC4_CREATE_SHADER_BO, struct drm_vc4_create_shader_bo) struct drm_vc4_submit_rcl_surface { uint32_t hindex; /* Handle index, or ~0 if not present. */ @@ -183,6 +185,29 @@ struct drm_vc4_create_bo { }; /** + * struct drm_vc4_create_shader_bo - ioctl argument for creating VC4 + * shader BOs. + * + * Since allowing a shader to be overwritten while it's also being + * executed from would allow privlege escalation, shaders must be + * created using this ioctl, and they can't be mmapped later. + */ +struct drm_vc4_create_shader_bo { + /* Size of the data argument. */ + uint32_t size; + /* Flags, currently must be 0. */ + uint32_t flags; + + /* Pointer to the data. */ + uint64_t data; + + /** Returned GEM handle for the BO. */ + uint32_t handle; + /* Pad, must be 0. */ + uint32_t pad; +}; + +/** * struct drm_vc4_mmap_bo - ioctl argument for mapping VC4 BOs. * * This doesn't actually perform an mmap. 
Instead, it returns the diff --git a/src/gallium/drivers/vc4/vc4_emit.c b/src/gallium/drivers/vc4/vc4_emit.c index d2b54fccf91..ba064ff889b 100644 --- a/src/gallium/drivers/vc4/vc4_emit.c +++ b/src/gallium/drivers/vc4/vc4_emit.c @@ -28,23 +28,24 @@ vc4_emit_state(struct pipe_context *pctx) { struct vc4_context *vc4 = vc4_context(pctx); + struct vc4_cl_out *bcl = cl_start(&vc4->bcl); if (vc4->dirty & (VC4_DIRTY_SCISSOR | VC4_DIRTY_VIEWPORT)) { float *vpscale = vc4->viewport.scale; float *vptranslate = vc4->viewport.translate; - float vp_minx = -fabs(vpscale[0]) + vptranslate[0]; - float vp_maxx = fabs(vpscale[0]) + vptranslate[0]; - float vp_miny = -fabs(vpscale[1]) + vptranslate[1]; - float vp_maxy = fabs(vpscale[1]) + vptranslate[1]; + float vp_minx = -fabsf(vpscale[0]) + vptranslate[0]; + float vp_maxx = fabsf(vpscale[0]) + vptranslate[0]; + float vp_miny = -fabsf(vpscale[1]) + vptranslate[1]; + float vp_maxy = fabsf(vpscale[1]) + vptranslate[1]; uint32_t minx = MAX2(vc4->scissor.minx, vp_minx); uint32_t miny = MAX2(vc4->scissor.miny, vp_miny); uint32_t maxx = MIN2(vc4->scissor.maxx, vp_maxx); uint32_t maxy = MIN2(vc4->scissor.maxy, vp_maxy); - cl_u8(&vc4->bcl, VC4_PACKET_CLIP_WINDOW); - cl_u16(&vc4->bcl, minx); - cl_u16(&vc4->bcl, miny); - cl_u16(&vc4->bcl, maxx - minx); - cl_u16(&vc4->bcl, maxy - miny); + cl_u8(&bcl, VC4_PACKET_CLIP_WINDOW); + cl_u16(&bcl, minx); + cl_u16(&bcl, miny); + cl_u16(&bcl, maxx - minx); + cl_u16(&bcl, maxy - miny); vc4->draw_min_x = MIN2(vc4->draw_min_x, minx); vc4->draw_min_y = MIN2(vc4->draw_min_y, miny); @@ -53,47 +54,49 @@ vc4_emit_state(struct pipe_context *pctx) } if (vc4->dirty & (VC4_DIRTY_RASTERIZER | VC4_DIRTY_ZSA)) { - cl_u8(&vc4->bcl, VC4_PACKET_CONFIGURATION_BITS); - cl_u8(&vc4->bcl, + cl_u8(&bcl, VC4_PACKET_CONFIGURATION_BITS); + cl_u8(&bcl, vc4->rasterizer->config_bits[0] | vc4->zsa->config_bits[0]); - cl_u8(&vc4->bcl, + cl_u8(&bcl, vc4->rasterizer->config_bits[1] | vc4->zsa->config_bits[1]); - cl_u8(&vc4->bcl, + 
cl_u8(&bcl, vc4->rasterizer->config_bits[2] | vc4->zsa->config_bits[2]); } if (vc4->dirty & VC4_DIRTY_RASTERIZER) { - cl_u8(&vc4->bcl, VC4_PACKET_DEPTH_OFFSET); - cl_u16(&vc4->bcl, vc4->rasterizer->offset_factor); - cl_u16(&vc4->bcl, vc4->rasterizer->offset_units); + cl_u8(&bcl, VC4_PACKET_DEPTH_OFFSET); + cl_u16(&bcl, vc4->rasterizer->offset_factor); + cl_u16(&bcl, vc4->rasterizer->offset_units); - cl_u8(&vc4->bcl, VC4_PACKET_POINT_SIZE); - cl_f(&vc4->bcl, vc4->rasterizer->point_size); + cl_u8(&bcl, VC4_PACKET_POINT_SIZE); + cl_f(&bcl, vc4->rasterizer->point_size); - cl_u8(&vc4->bcl, VC4_PACKET_LINE_WIDTH); - cl_f(&vc4->bcl, vc4->rasterizer->base.line_width); + cl_u8(&bcl, VC4_PACKET_LINE_WIDTH); + cl_f(&bcl, vc4->rasterizer->base.line_width); } if (vc4->dirty & VC4_DIRTY_VIEWPORT) { - cl_u8(&vc4->bcl, VC4_PACKET_CLIPPER_XY_SCALING); - cl_f(&vc4->bcl, vc4->viewport.scale[0] * 16.0f); - cl_f(&vc4->bcl, vc4->viewport.scale[1] * 16.0f); + cl_u8(&bcl, VC4_PACKET_CLIPPER_XY_SCALING); + cl_f(&bcl, vc4->viewport.scale[0] * 16.0f); + cl_f(&bcl, vc4->viewport.scale[1] * 16.0f); - cl_u8(&vc4->bcl, VC4_PACKET_CLIPPER_Z_SCALING); - cl_f(&vc4->bcl, vc4->viewport.translate[2]); - cl_f(&vc4->bcl, vc4->viewport.scale[2]); + cl_u8(&bcl, VC4_PACKET_CLIPPER_Z_SCALING); + cl_f(&bcl, vc4->viewport.translate[2]); + cl_f(&bcl, vc4->viewport.scale[2]); - cl_u8(&vc4->bcl, VC4_PACKET_VIEWPORT_OFFSET); - cl_u16(&vc4->bcl, 16 * vc4->viewport.translate[0]); - cl_u16(&vc4->bcl, 16 * vc4->viewport.translate[1]); + cl_u8(&bcl, VC4_PACKET_VIEWPORT_OFFSET); + cl_u16(&bcl, 16 * vc4->viewport.translate[0]); + cl_u16(&bcl, 16 * vc4->viewport.translate[1]); } if (vc4->dirty & VC4_DIRTY_FLAT_SHADE_FLAGS) { - cl_u8(&vc4->bcl, VC4_PACKET_FLAT_SHADE_FLAGS); - cl_u32(&vc4->bcl, vc4->rasterizer->base.flatshade ? + cl_u8(&bcl, VC4_PACKET_FLAT_SHADE_FLAGS); + cl_u32(&bcl, vc4->rasterizer->base.flatshade ? 
vc4->prog.fs->color_inputs : 0); } + + cl_end(&vc4->bcl, bcl); } diff --git a/src/gallium/drivers/vc4/vc4_fence.c b/src/gallium/drivers/vc4/vc4_fence.c index f2ee91de61a..b6fb2a8a460 100644 --- a/src/gallium/drivers/vc4/vc4_fence.c +++ b/src/gallium/drivers/vc4/vc4_fence.c @@ -60,16 +60,6 @@ vc4_fence_reference(struct pipe_screen *pscreen, } static boolean -vc4_fence_signalled(struct pipe_screen *pscreen, - struct pipe_fence_handle *pf) -{ - struct vc4_screen *screen = vc4_screen(pscreen); - struct vc4_fence *f = (struct vc4_fence *)pf; - - return vc4_wait_seqno(screen, f->seqno, 0); -} - -static boolean vc4_fence_finish(struct pipe_screen *pscreen, struct pipe_fence_handle *pf, uint64_t timeout_ns) @@ -77,7 +67,7 @@ vc4_fence_finish(struct pipe_screen *pscreen, struct vc4_screen *screen = vc4_screen(pscreen); struct vc4_fence *f = (struct vc4_fence *)pf; - return vc4_wait_seqno(screen, f->seqno, timeout_ns); + return vc4_wait_seqno(screen, f->seqno, timeout_ns, "fence wait"); } struct vc4_fence * @@ -98,6 +88,5 @@ void vc4_fence_init(struct vc4_screen *screen) { screen->base.fence_reference = vc4_fence_reference; - screen->base.fence_signalled = vc4_fence_signalled; screen->base.fence_finish = vc4_fence_finish; } diff --git a/src/gallium/drivers/vc4/vc4_formats.c b/src/gallium/drivers/vc4/vc4_formats.c index 004bac70c67..ffce61237de 100644 --- a/src/gallium/drivers/vc4/vc4_formats.c +++ b/src/gallium/drivers/vc4/vc4_formats.c @@ -108,7 +108,7 @@ static const struct vc4_format vc4_format_table[] = { static const struct vc4_format * get_format(enum pipe_format f) { - if (f > ARRAY_SIZE(vc4_format_table) || + if (f >= ARRAY_SIZE(vc4_format_table) || !vc4_format_table[f].present) return NULL; else diff --git a/src/gallium/drivers/vc4/vc4_job.c b/src/gallium/drivers/vc4/vc4_job.c index dcade15443a..7ebd9f160eb 100644 --- a/src/gallium/drivers/vc4/vc4_job.c +++ b/src/gallium/drivers/vc4/vc4_job.c @@ -44,8 +44,7 @@ void vc4_job_reset(struct vc4_context *vc4) { struct 
vc4_bo **referenced_bos = vc4->bo_pointers.base; - for (int i = 0; i < (vc4->bo_handles.next - - vc4->bo_handles.base) / 4; i++) { + for (int i = 0; i < cl_offset(&vc4->bo_handles) / 4; i++) { vc4_bo_unreference(&referenced_bos[i]); } vc4_reset_cl(&vc4->bcl); @@ -145,7 +144,7 @@ vc4_job_submit(struct vc4_context *vc4) { if (vc4_debug & VC4_DEBUG_CL) { fprintf(stderr, "BCL:\n"); - vc4_dump_cl(vc4->bcl.base, vc4->bcl.next - vc4->bcl.base, false); + vc4_dump_cl(vc4->bcl.base, cl_offset(&vc4->bcl), false); } struct drm_vc4_submit_cl submit; @@ -164,15 +163,14 @@ vc4_job_submit(struct vc4_context *vc4) vc4->zs_write, true, true); submit.bo_handles = (uintptr_t)vc4->bo_handles.base; - submit.bo_handle_count = (vc4->bo_handles.next - - vc4->bo_handles.base) / 4; + submit.bo_handle_count = cl_offset(&vc4->bo_handles) / 4; submit.bin_cl = (uintptr_t)vc4->bcl.base; - submit.bin_cl_size = vc4->bcl.next - vc4->bcl.base; + submit.bin_cl_size = cl_offset(&vc4->bcl); submit.shader_rec = (uintptr_t)vc4->shader_rec.base; - submit.shader_rec_size = vc4->shader_rec.next - vc4->shader_rec.base; + submit.shader_rec_size = cl_offset(&vc4->shader_rec); submit.shader_rec_count = vc4->shader_rec_count; submit.uniforms = (uintptr_t)vc4->uniforms.base; - submit.uniforms_size = vc4->uniforms.next - vc4->uniforms.base; + submit.uniforms_size = cl_offset(&vc4->uniforms); assert(vc4->draw_min_x != ~0 && vc4->draw_min_y != ~0); submit.min_x_tile = vc4->draw_min_x / 64; @@ -207,7 +205,7 @@ vc4_job_submit(struct vc4_context *vc4) if (vc4_debug & VC4_DEBUG_ALWAYS_SYNC) { if (!vc4_wait_seqno(vc4->screen, vc4->last_emit_seqno, - PIPE_TIMEOUT_INFINITE)) { + PIPE_TIMEOUT_INFINITE, "sync")) { fprintf(stderr, "Wait failed.\n"); abort(); } diff --git a/src/gallium/drivers/vc4/vc4_nir_lower_blend.c b/src/gallium/drivers/vc4/vc4_nir_lower_blend.c new file mode 100644 index 00000000000..a372a6c0cdc --- /dev/null +++ b/src/gallium/drivers/vc4/vc4_nir_lower_blend.c @@ -0,0 +1,431 @@ +/* + * Copyright © 2015 
Broadcom + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. + */ + +/** + * Implements most of the fixed function fragment pipeline in shader code. + * + * VC4 doesn't have any hardware support for blending, alpha test, logic ops, + * or color mask. Instead, you read the current contents of the destination + * from the tile buffer after having waited for the scoreboard (which is + * handled by vc4_qpu_emit.c), then do math using your output color and that + * destination value, and update the output color appropriately. + */ + +/** + * Lowers fixed-function blending to a load of the destination color and a + * series of ALU operations before the store of the output. + */ +#include "util/u_format.h" +#include "vc4_qir.h" +#include "glsl/nir/nir_builder.h" +#include "vc4_context.h" + +/** Emits a load of the previous fragment color from the tile buffer. 
*/ +static nir_ssa_def * +vc4_nir_get_dst_color(nir_builder *b) +{ + nir_intrinsic_instr *load = + nir_intrinsic_instr_create(b->shader, + nir_intrinsic_load_input); + load->num_components = 1; + load->const_index[0] = VC4_NIR_TLB_COLOR_READ_INPUT; + nir_ssa_dest_init(&load->instr, &load->dest, 1, NULL); + nir_builder_instr_insert(b, &load->instr); + return &load->dest.ssa; +} + +static nir_ssa_def * +vc4_nir_srgb_decode(nir_builder *b, nir_ssa_def *srgb) +{ + nir_ssa_def *is_low = nir_flt(b, srgb, nir_imm_float(b, 0.04045)); + nir_ssa_def *low = nir_fmul(b, srgb, nir_imm_float(b, 1.0 / 12.92)); + nir_ssa_def *high = nir_fpow(b, + nir_fmul(b, + nir_fadd(b, srgb, + nir_imm_float(b, 0.055)), + nir_imm_float(b, 1.0 / 1.055)), + nir_imm_float(b, 2.4)); + + return nir_bcsel(b, is_low, low, high); +} + +static nir_ssa_def * +vc4_nir_srgb_encode(nir_builder *b, nir_ssa_def *linear) +{ + nir_ssa_def *is_low = nir_flt(b, linear, nir_imm_float(b, 0.0031308)); + nir_ssa_def *low = nir_fmul(b, linear, nir_imm_float(b, 12.92)); + nir_ssa_def *high = nir_fsub(b, + nir_fmul(b, + nir_imm_float(b, 1.055), + nir_fpow(b, + linear, + nir_imm_float(b, 0.41666))), + nir_imm_float(b, 0.055)); + + return nir_bcsel(b, is_low, low, high); +} + +static nir_ssa_def * +vc4_blend_channel(nir_builder *b, + nir_ssa_def **src, + nir_ssa_def **dst, + unsigned factor, + int channel) +{ + switch(factor) { + case PIPE_BLENDFACTOR_ONE: + return nir_imm_float(b, 1.0); + case PIPE_BLENDFACTOR_SRC_COLOR: + return src[channel]; + case PIPE_BLENDFACTOR_SRC_ALPHA: + return src[3]; + case PIPE_BLENDFACTOR_DST_ALPHA: + return dst[3]; + case PIPE_BLENDFACTOR_DST_COLOR: + return dst[channel]; + case PIPE_BLENDFACTOR_SRC_ALPHA_SATURATE: + if (channel != 3) { + return nir_fmin(b, + src[3], + nir_fsub(b, + nir_imm_float(b, 1.0), + dst[3])); + } else { + return nir_imm_float(b, 1.0); + } + case PIPE_BLENDFACTOR_CONST_COLOR: + return vc4_nir_get_state_uniform(b, QUNIFORM_BLEND_CONST_COLOR_X + channel); + case 
PIPE_BLENDFACTOR_CONST_ALPHA: + return vc4_nir_get_state_uniform(b, QUNIFORM_BLEND_CONST_COLOR_W); + case PIPE_BLENDFACTOR_ZERO: + return nir_imm_float(b, 0.0); + case PIPE_BLENDFACTOR_INV_SRC_COLOR: + return nir_fsub(b, nir_imm_float(b, 1.0), src[channel]); + case PIPE_BLENDFACTOR_INV_SRC_ALPHA: + return nir_fsub(b, nir_imm_float(b, 1.0), src[3]); + case PIPE_BLENDFACTOR_INV_DST_ALPHA: + return nir_fsub(b, nir_imm_float(b, 1.0), dst[3]); + case PIPE_BLENDFACTOR_INV_DST_COLOR: + return nir_fsub(b, nir_imm_float(b, 1.0), dst[channel]); + case PIPE_BLENDFACTOR_INV_CONST_COLOR: + return nir_fsub(b, nir_imm_float(b, 1.0), + vc4_nir_get_state_uniform(b, QUNIFORM_BLEND_CONST_COLOR_X + channel)); + case PIPE_BLENDFACTOR_INV_CONST_ALPHA: + return nir_fsub(b, nir_imm_float(b, 1.0), + vc4_nir_get_state_uniform(b, QUNIFORM_BLEND_CONST_COLOR_W)); + + default: + case PIPE_BLENDFACTOR_SRC1_COLOR: + case PIPE_BLENDFACTOR_SRC1_ALPHA: + case PIPE_BLENDFACTOR_INV_SRC1_COLOR: + case PIPE_BLENDFACTOR_INV_SRC1_ALPHA: + /* Unsupported. */ + fprintf(stderr, "Unknown blend factor %d\n", factor); + return nir_imm_float(b, 1.0); + } +} + +static nir_ssa_def * +vc4_blend_func(nir_builder *b, nir_ssa_def *src, nir_ssa_def *dst, + unsigned func) +{ + switch (func) { + case PIPE_BLEND_ADD: + return nir_fadd(b, src, dst); + case PIPE_BLEND_SUBTRACT: + return nir_fsub(b, src, dst); + case PIPE_BLEND_REVERSE_SUBTRACT: + return nir_fsub(b, dst, src); + case PIPE_BLEND_MIN: + return nir_fmin(b, src, dst); + case PIPE_BLEND_MAX: + return nir_fmax(b, src, dst); + + default: + /* Unsupported. 
*/ + fprintf(stderr, "Unknown blend func %d\n", func); + return src; + + } +} + +static void +vc4_do_blending(struct vc4_compile *c, nir_builder *b, nir_ssa_def **result, + nir_ssa_def **src_color, nir_ssa_def **dst_color) +{ + struct pipe_rt_blend_state *blend = &c->fs_key->blend; + + if (!blend->blend_enable) { + for (int i = 0; i < 4; i++) + result[i] = src_color[i]; + return; + } + + /* Clamp the src color to [0, 1]. Dest is already clamped. */ + for (int i = 0; i < 4; i++) + src_color[i] = nir_fsat(b, src_color[i]); + + nir_ssa_def *src_blend[4], *dst_blend[4]; + for (int i = 0; i < 4; i++) { + int src_factor = ((i != 3) ? blend->rgb_src_factor : + blend->alpha_src_factor); + int dst_factor = ((i != 3) ? blend->rgb_dst_factor : + blend->alpha_dst_factor); + src_blend[i] = nir_fmul(b, src_color[i], + vc4_blend_channel(b, + src_color, dst_color, + src_factor, i)); + dst_blend[i] = nir_fmul(b, dst_color[i], + vc4_blend_channel(b, + src_color, dst_color, + dst_factor, i)); + } + + for (int i = 0; i < 4; i++) { + result[i] = vc4_blend_func(b, src_blend[i], dst_blend[i], + ((i != 3) ? 
blend->rgb_func : + blend->alpha_func)); + } +} + +static nir_ssa_def * +vc4_logicop(nir_builder *b, int logicop_func, + nir_ssa_def *src, nir_ssa_def *dst) +{ + switch (logicop_func) { + case PIPE_LOGICOP_CLEAR: + return nir_imm_int(b, 0); + case PIPE_LOGICOP_NOR: + return nir_inot(b, nir_ior(b, src, dst)); + case PIPE_LOGICOP_AND_INVERTED: + return nir_iand(b, nir_inot(b, src), dst); + case PIPE_LOGICOP_COPY_INVERTED: + return nir_inot(b, src); + case PIPE_LOGICOP_AND_REVERSE: + return nir_iand(b, src, nir_inot(b, dst)); + case PIPE_LOGICOP_INVERT: + return nir_inot(b, dst); + case PIPE_LOGICOP_XOR: + return nir_ixor(b, src, dst); + case PIPE_LOGICOP_NAND: + return nir_inot(b, nir_iand(b, src, dst)); + case PIPE_LOGICOP_AND: + return nir_iand(b, src, dst); + case PIPE_LOGICOP_EQUIV: + return nir_inot(b, nir_ixor(b, src, dst)); + case PIPE_LOGICOP_NOOP: + return dst; + case PIPE_LOGICOP_OR_INVERTED: + return nir_ior(b, nir_inot(b, src), dst); + case PIPE_LOGICOP_OR_REVERSE: + return nir_ior(b, src, nir_inot(b, dst)); + case PIPE_LOGICOP_OR: + return nir_ior(b, src, dst); + case PIPE_LOGICOP_SET: + return nir_imm_int(b, ~0); + default: + fprintf(stderr, "Unknown logic op %d\n", logicop_func); + /* FALLTHROUGH */ + case PIPE_LOGICOP_COPY: + return src; + } +} + +static nir_ssa_def * +vc4_nir_pipe_compare_func(nir_builder *b, int func, + nir_ssa_def *src0, nir_ssa_def *src1) +{ + switch (func) { + default: + fprintf(stderr, "Unknown compare func %d\n", func); + /* FALLTHROUGH */ + case PIPE_FUNC_NEVER: + return nir_imm_int(b, 0); + case PIPE_FUNC_ALWAYS: + return nir_imm_int(b, ~0); + case PIPE_FUNC_EQUAL: + return nir_feq(b, src0, src1); + case PIPE_FUNC_NOTEQUAL: + return nir_fne(b, src0, src1); + case PIPE_FUNC_GREATER: + return nir_flt(b, src1, src0); + case PIPE_FUNC_GEQUAL: + return nir_fge(b, src0, src1); + case PIPE_FUNC_LESS: + return nir_flt(b, src0, src1); + case PIPE_FUNC_LEQUAL: + return nir_fge(b, src1, src0); + } +} + +static void 
+vc4_nir_emit_alpha_test_discard(struct vc4_compile *c, nir_builder *b, + nir_ssa_def *alpha) +{ + if (!c->fs_key->alpha_test) + return; + + nir_ssa_def *alpha_ref = + vc4_nir_get_state_uniform(b, QUNIFORM_ALPHA_REF); + nir_ssa_def *condition = + vc4_nir_pipe_compare_func(b, c->fs_key->alpha_test_func, + alpha, alpha_ref); + + nir_intrinsic_instr *discard = + nir_intrinsic_instr_create(b->shader, + nir_intrinsic_discard_if); + discard->num_components = 1; + discard->src[0] = nir_src_for_ssa(nir_inot(b, condition)); + nir_builder_instr_insert(b, &discard->instr); +} + +static void +vc4_nir_lower_blend_instr(struct vc4_compile *c, nir_builder *b, + nir_intrinsic_instr *intr) +{ + enum pipe_format color_format = c->fs_key->color_format; + const uint8_t *format_swiz = vc4_get_format_swizzle(color_format); + + /* Pull out the float src/dst color components. */ + nir_ssa_def *packed_dst_color = vc4_nir_get_dst_color(b); + nir_ssa_def *dst_vec4 = nir_unpack_unorm_4x8(b, packed_dst_color); + nir_ssa_def *src_color[4], *unpacked_dst_color[4]; + for (unsigned i = 0; i < 4; i++) { + src_color[i] = nir_swizzle(b, intr->src[0].ssa, &i, 1, false); + unpacked_dst_color[i] = nir_swizzle(b, dst_vec4, &i, 1, false); + } + + /* Unswizzle the destination color. */ + nir_ssa_def *dst_color[4]; + for (unsigned i = 0; i < 4; i++) { + dst_color[i] = vc4_nir_get_swizzled_channel(b, + unpacked_dst_color, + format_swiz[i]); + } + + vc4_nir_emit_alpha_test_discard(c, b, src_color[3]); + + /* Turn dst color to linear. 
*/ + if (util_format_is_srgb(color_format)) { + for (int i = 0; i < 3; i++) + dst_color[i] = vc4_nir_srgb_decode(b, dst_color[i]); + } + + nir_ssa_def *blend_color[4]; + vc4_do_blending(c, b, blend_color, src_color, dst_color); + + /* sRGB encode the output color */ + if (util_format_is_srgb(color_format)) { + for (int i = 0; i < 3; i++) + blend_color[i] = vc4_nir_srgb_encode(b, blend_color[i]); + } + + nir_ssa_def *swizzled_outputs[4]; + for (int i = 0; i < 4; i++) { + swizzled_outputs[i] = + vc4_nir_get_swizzled_channel(b, blend_color, + format_swiz[i]); + } + + nir_ssa_def *packed_color = + nir_pack_unorm_4x8(b, + nir_vec4(b, + swizzled_outputs[0], + swizzled_outputs[1], + swizzled_outputs[2], + swizzled_outputs[3])); + + packed_color = vc4_logicop(b, c->fs_key->logicop_func, + packed_color, packed_dst_color); + + /* If the bit isn't set in the color mask, then just return the + * original dst color, instead. + */ + uint32_t colormask = 0xffffffff; + for (int i = 0; i < 4; i++) { + if (format_swiz[i] < 4 && + !(c->fs_key->blend.colormask & (1 << format_swiz[i]))) { + colormask &= ~(0xff << (i * 8)); + } + } + packed_color = nir_ior(b, + nir_iand(b, packed_color, + nir_imm_int(b, colormask)), + nir_iand(b, packed_dst_color, + nir_imm_int(b, ~colormask))); + + /* Turn the old vec4 output into a store of the packed color. 
*/ + nir_instr_rewrite_src(&intr->instr, &intr->src[0], + nir_src_for_ssa(packed_color)); + intr->num_components = 1; +} + +static bool +vc4_nir_lower_blend_block(nir_block *block, void *state) +{ + struct vc4_compile *c = state; + + nir_foreach_instr(block, instr) { + if (instr->type != nir_instr_type_intrinsic) + continue; + nir_intrinsic_instr *intr = nir_instr_as_intrinsic(instr); + if (intr->intrinsic != nir_intrinsic_store_output) + continue; + + nir_variable *output_var = NULL; + foreach_list_typed(nir_variable, var, node, &c->s->outputs) { + if (var->data.driver_location == intr->const_index[0]) { + output_var = var; + break; + } + } + assert(output_var); + unsigned semantic_name = output_var->data.location; + + if (semantic_name != TGSI_SEMANTIC_COLOR) + continue; + + nir_function_impl *impl = + nir_cf_node_get_function(&block->cf_node); + nir_builder b; + nir_builder_init(&b, impl); + nir_builder_insert_before_instr(&b, &intr->instr); + vc4_nir_lower_blend_instr(c, &b, intr); + } + return true; +} + +void +vc4_nir_lower_blend(struct vc4_compile *c) +{ + nir_foreach_overload(c->s, overload) { + if (overload->impl) { + nir_foreach_block(overload->impl, + vc4_nir_lower_blend_block, c); + + nir_metadata_preserve(overload->impl, + nir_metadata_block_index | + nir_metadata_dominance); + } + } +} diff --git a/src/gallium/drivers/vc4/vc4_nir_lower_io.c b/src/gallium/drivers/vc4/vc4_nir_lower_io.c new file mode 100644 index 00000000000..229d41147d8 --- /dev/null +++ b/src/gallium/drivers/vc4/vc4_nir_lower_io.c @@ -0,0 +1,291 @@ +/* + * Copyright © 2015 Broadcom + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is 
furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. + */ + +#include "vc4_qir.h" +#include "tgsi/tgsi_info.h" +#include "glsl/nir/nir_builder.h" + +/** + * Walks the NIR generated by TGSI-to-NIR to lower its io intrinsics into + * something amenable to the VC4 architecture. + * + * Currently, it split inputs, outputs, and uniforms into scalars, drops any + * non-position outputs in coordinate shaders, and fixes up the addressing on + * indirect uniform loads. + */ + +static void +replace_intrinsic_with_vec4(nir_builder *b, nir_intrinsic_instr *intr, + nir_ssa_def **comps) +{ + + /* Batch things back together into a vec4. This will get split by the + * later ALU scalarization pass. + */ + nir_ssa_def *vec = nir_vec4(b, comps[0], comps[1], comps[2], comps[3]); + + /* Replace the old intrinsic with a reference to our reconstructed + * vec4. + */ + nir_ssa_def_rewrite_uses(&intr->dest.ssa, nir_src_for_ssa(vec), + ralloc_parent(b->impl)); + nir_instr_remove(&intr->instr); +} + +static void +vc4_nir_lower_input(struct vc4_compile *c, nir_builder *b, + nir_intrinsic_instr *intr) +{ + nir_builder_insert_before_instr(b, &intr->instr); + + if (c->stage == QSTAGE_FRAG && intr->const_index[0] == + VC4_NIR_TLB_COLOR_READ_INPUT) { + /* This doesn't need any lowering. 
*/ + return; + } + + nir_variable *input_var = NULL; + foreach_list_typed(nir_variable, var, node, &c->s->inputs) { + if (var->data.driver_location == intr->const_index[0]) { + input_var = var; + break; + } + } + assert(input_var); + int semantic_name = input_var->data.location; + int semantic_index = input_var->data.index; + + /* All TGSI-to-NIR inputs are vec4. */ + assert(intr->num_components == 4); + + /* Generate scalar loads equivalent to the original VEC4. */ + nir_ssa_def *dests[4]; + for (unsigned i = 0; i < intr->num_components; i++) { + nir_intrinsic_instr *intr_comp = + nir_intrinsic_instr_create(c->s, nir_intrinsic_load_input); + intr_comp->num_components = 1; + intr_comp->const_index[0] = intr->const_index[0] * 4 + i; + nir_ssa_dest_init(&intr_comp->instr, &intr_comp->dest, 1, NULL); + nir_builder_instr_insert(b, &intr_comp->instr); + + dests[i] = &intr_comp->dest.ssa; + } + + switch (c->stage) { + case QSTAGE_FRAG: + switch (semantic_name) { + case TGSI_SEMANTIC_FACE: + dests[0] = nir_fsub(b, + nir_imm_float(b, 1.0), + nir_fmul(b, + nir_i2f(b, dests[0]), + nir_imm_float(b, 2.0))); + dests[1] = nir_imm_float(b, 0.0); + dests[2] = nir_imm_float(b, 0.0); + dests[3] = nir_imm_float(b, 1.0); + break; + case TGSI_SEMANTIC_GENERIC: + if (c->fs_key->point_sprite_mask & + (1 << semantic_index)) { + if (!c->fs_key->is_points) { + dests[0] = nir_imm_float(b, 0.0); + dests[1] = nir_imm_float(b, 0.0); + } + if (c->fs_key->point_coord_upper_left) { + dests[1] = nir_fsub(b, + nir_imm_float(b, 1.0), + dests[1]); + } + dests[2] = nir_imm_float(b, 0.0); + dests[3] = nir_imm_float(b, 1.0); + } + break; + } + break; + case QSTAGE_COORD: + case QSTAGE_VERT: + break; + } + + replace_intrinsic_with_vec4(b, intr, dests); +} + +static void +vc4_nir_lower_output(struct vc4_compile *c, nir_builder *b, + nir_intrinsic_instr *intr) +{ + nir_variable *output_var = NULL; + foreach_list_typed(nir_variable, var, node, &c->s->outputs) { + if (var->data.driver_location == 
intr->const_index[0]) { + output_var = var; + break; + } + } + assert(output_var); + unsigned semantic_name = output_var->data.location; + + if (c->stage == QSTAGE_COORD && + (semantic_name != TGSI_SEMANTIC_POSITION && + semantic_name != TGSI_SEMANTIC_PSIZE)) { + nir_instr_remove(&intr->instr); + return; + } + + /* Color output is lowered by vc4_nir_lower_blend(). */ + if (c->stage == QSTAGE_FRAG && semantic_name == TGSI_SEMANTIC_COLOR) { + intr->const_index[0] *= 4; + return; + } + + /* All TGSI-to-NIR outputs are VEC4. */ + assert(intr->num_components == 4); + + nir_builder_insert_before_instr(b, &intr->instr); + + for (unsigned i = 0; i < intr->num_components; i++) { + nir_intrinsic_instr *intr_comp = + nir_intrinsic_instr_create(c->s, nir_intrinsic_store_output); + intr_comp->num_components = 1; + intr_comp->const_index[0] = intr->const_index[0] * 4 + i; + + assert(intr->src[0].is_ssa); + intr_comp->src[0] = nir_src_for_ssa(nir_swizzle(b, + intr->src[0].ssa, + &i, 1, false)); + nir_builder_instr_insert(b, &intr_comp->instr); + } + + nir_instr_remove(&intr->instr); +} + +static void +vc4_nir_lower_uniform(struct vc4_compile *c, nir_builder *b, + nir_intrinsic_instr *intr) +{ + /* All TGSI-to-NIR uniform loads are vec4, but we may create dword + * loads in our lowering passes. + */ + if (intr->num_components == 1) + return; + assert(intr->num_components == 4); + + nir_builder_insert_before_instr(b, &intr->instr); + + /* Generate scalar loads equivalent to the original VEC4. */ + nir_ssa_def *dests[4]; + for (unsigned i = 0; i < intr->num_components; i++) { + nir_intrinsic_instr *intr_comp = + nir_intrinsic_instr_create(c->s, intr->intrinsic); + intr_comp->num_components = 1; + nir_ssa_dest_init(&intr_comp->instr, &intr_comp->dest, 1, NULL); + + if (intr->intrinsic == nir_intrinsic_load_uniform_indirect) { + /* Convert the variable TGSI register index to a byte + * offset. 
+ */ + intr_comp->src[0] = + nir_src_for_ssa(nir_ishl(b, + intr->src[0].ssa, + nir_imm_int(b, 4))); + + /* Convert the offset to be a byte index, too. */ + intr_comp->const_index[0] = (intr->const_index[0] * 16 + + i * 4); + } else { + /* We want a dword index for non-indirect uniform + * loads. + */ + intr_comp->const_index[0] = (intr->const_index[0] * 4 + + i); + } + + dests[i] = &intr_comp->dest.ssa; + + nir_builder_instr_insert(b, &intr_comp->instr); + } + + replace_intrinsic_with_vec4(b, intr, dests); +} + +static void +vc4_nir_lower_io_instr(struct vc4_compile *c, nir_builder *b, + struct nir_instr *instr) +{ + if (instr->type != nir_instr_type_intrinsic) + return; + nir_intrinsic_instr *intr = nir_instr_as_intrinsic(instr); + + switch (intr->intrinsic) { + case nir_intrinsic_load_input: + vc4_nir_lower_input(c, b, intr); + break; + + case nir_intrinsic_store_output: + vc4_nir_lower_output(c, b, intr); + break; + + case nir_intrinsic_load_uniform: + case nir_intrinsic_load_uniform_indirect: + vc4_nir_lower_uniform(c, b, intr); + break; + + default: + break; + } +} + +static bool +vc4_nir_lower_io_block(nir_block *block, void *arg) +{ + struct vc4_compile *c = arg; + nir_function_impl *impl = + nir_cf_node_get_function(&block->cf_node); + + nir_builder b; + nir_builder_init(&b, impl); + + nir_foreach_instr_safe(block, instr) + vc4_nir_lower_io_instr(c, &b, instr); + + return true; +} + +static bool +vc4_nir_lower_io_impl(struct vc4_compile *c, nir_function_impl *impl) +{ + nir_foreach_block(impl, vc4_nir_lower_io_block, c); + + nir_metadata_preserve(impl, nir_metadata_block_index | + nir_metadata_dominance); + + return true; +} + +void +vc4_nir_lower_io(struct vc4_compile *c) +{ + nir_foreach_overload(c->s, overload) { + if (overload->impl) + vc4_nir_lower_io_impl(c, overload->impl); + } +} diff --git a/src/gallium/drivers/vc4/vc4_opt_copy_propagation.c b/src/gallium/drivers/vc4/vc4_opt_copy_propagation.c index d6d2fbf257f..a755de9aa41 100644 --- 
a/src/gallium/drivers/vc4/vc4_opt_copy_propagation.c +++ b/src/gallium/drivers/vc4/vc4_opt_copy_propagation.c @@ -67,10 +67,7 @@ qir_opt_copy_propagation(struct vc4_compile *c) if (inst->op == QOP_MOV && inst->dst.file == QFILE_TEMP && - inst->src[0].file != QFILE_VPM && - !(inst->src[0].file == QFILE_TEMP && - (c->defs[inst->src[0].index]->op == QOP_TEX_RESULT || - c->defs[inst->src[0].index]->op == QOP_TLB_COLOR_READ))) { + inst->src[0].file != QFILE_VPM) { movs[inst->dst.index] = inst->src[0]; } } diff --git a/src/gallium/drivers/vc4/vc4_opt_cse.c b/src/gallium/drivers/vc4/vc4_opt_cse.c index 92c8260eb59..0e5480ea781 100644 --- a/src/gallium/drivers/vc4/vc4_opt_cse.c +++ b/src/gallium/drivers/vc4/vc4_opt_cse.c @@ -46,8 +46,7 @@ struct inst_key { struct qreg src[4]; /** * If the instruction depends on the flags, how many SFs have been - * seen before this instruction, or if it depends on r4, how many r4 - * writes have been seen. + * seen before this instruction. */ uint32_t implicit_arg_update_count; }; @@ -63,8 +62,7 @@ inst_key_equals(const void *a, const void *b) static struct qinst * vc4_find_cse(struct vc4_compile *c, struct hash_table *ht, - struct qinst *inst, uint32_t sf_count, - uint32_t r4_count) + struct qinst *inst, uint32_t sf_count) { if (inst->dst.file != QFILE_TEMP || inst->op == QOP_MOV || @@ -79,8 +77,6 @@ vc4_find_cse(struct vc4_compile *c, struct hash_table *ht, qir_get_op_nsrc(inst->op) * sizeof(key.src[0])); if (qir_depends_on_flags(inst)) key.implicit_arg_update_count = sf_count; - if (qir_reads_r4(inst)) - key.implicit_arg_update_count = r4_count; uint32_t hash = _mesa_hash_data(&key, sizeof(key)); struct hash_entry *entry = @@ -121,7 +117,7 @@ bool qir_opt_cse(struct vc4_compile *c) { bool progress = false; - uint32_t sf_count = 0, r4_count = 0; + uint32_t sf_count = 0; struct hash_table *ht = _mesa_hash_table_create(NULL, NULL, inst_key_equals); @@ -130,15 +126,15 @@ qir_opt_cse(struct vc4_compile *c) list_for_each_entry(struct qinst, 
inst, &c->instructions, link) { if (qir_has_side_effects(c, inst) || - qir_has_side_effect_reads(c, inst)) { + qir_has_side_effect_reads(c, inst) || + inst->op == QOP_TLB_COLOR_READ) { continue; } if (inst->sf) { sf_count++; } else { - struct qinst *cse = vc4_find_cse(c, ht, inst, - sf_count, r4_count); + struct qinst *cse = vc4_find_cse(c, ht, inst, sf_count); if (cse) { inst->src[0] = cse->dst; for (int i = 1; i < qir_get_op_nsrc(inst->op); @@ -154,9 +150,6 @@ qir_opt_cse(struct vc4_compile *c) } } } - - if (qir_writes_r4(inst)) - r4_count++; } ralloc_free(ht); diff --git a/src/gallium/drivers/vc4/vc4_program.c b/src/gallium/drivers/vc4/vc4_program.c index ba47c51d9bd..13c472152d8 100644 --- a/src/gallium/drivers/vc4/vc4_program.c +++ b/src/gallium/drivers/vc4/vc4_program.c @@ -23,21 +23,19 @@ */ #include <inttypes.h> -#include "pipe/p_state.h" #include "util/u_format.h" #include "util/u_hash.h" #include "util/u_math.h" #include "util/u_memory.h" -#include "util/u_pack_color.h" -#include "util/format_srgb.h" #include "util/ralloc.h" #include "util/hash_table.h" #include "tgsi/tgsi_dump.h" #include "tgsi/tgsi_info.h" #include "tgsi/tgsi_lowering.h" #include "tgsi/tgsi_parse.h" +#include "glsl/nir/nir.h" +#include "glsl/nir/nir_builder.h" #include "nir/tgsi_to_nir.h" - #include "vc4_context.h" #include "vc4_qpu.h" #include "vc4_qir.h" @@ -45,51 +43,8 @@ #include "simpenrose/simpenrose.h" #endif -struct vc4_key { - struct vc4_uncompiled_shader *shader_state; - struct { - enum pipe_format format; - unsigned compare_mode:1; - unsigned compare_func:3; - unsigned wrap_s:3; - unsigned wrap_t:3; - uint8_t swizzle[4]; - } tex[VC4_MAX_TEXTURE_SAMPLERS]; - uint8_t ucp_enables; -}; - -struct vc4_fs_key { - struct vc4_key base; - enum pipe_format color_format; - bool depth_enabled; - bool stencil_enabled; - bool stencil_twoside; - bool stencil_full_writemasks; - bool is_points; - bool is_lines; - bool alpha_test; - bool point_coord_upper_left; - bool light_twoside; - uint8_t 
alpha_test_func; - uint8_t logicop_func; - uint32_t point_sprite_mask; - - struct pipe_rt_blend_state blend; -}; - -struct vc4_vs_key { - struct vc4_key base; - - /** - * This is a proxy for the array of FS input semantics, which is - * larger than we would want to put in the key. - */ - uint64_t compiled_fs_id; - - enum pipe_format attr_formats[8]; - bool is_coord; - bool per_vertex_point_size; -}; +static struct qreg +ntq_get_src(struct vc4_compile *c, nir_src src, int i); static void resize_qreg_array(struct vc4_compile *c, @@ -113,10 +68,10 @@ resize_qreg_array(struct vc4_compile *c, } static struct qreg -indirect_uniform_load(struct vc4_compile *c, - struct qreg indirect_offset, - unsigned offset) +indirect_uniform_load(struct vc4_compile *c, nir_intrinsic_instr *intr) { + struct qreg indirect_offset = ntq_get_src(c, intr->src[0], 0); + uint32_t offset = intr->const_index[0]; struct vc4_compiler_ubo_range *range = NULL; unsigned i; for (i = 0; i < c->num_uniform_ranges; i++) { @@ -138,10 +93,6 @@ indirect_uniform_load(struct vc4_compile *c, }; offset -= range->src_offset; - /* Translate the user's TGSI register index from the TGSI register - * base to a byte offset. - */ - indirect_offset = qir_SHL(c, indirect_offset, qir_uniform_ui(c, 4)); /* Adjust for where we stored the TGSI register base. 
*/ indirect_offset = qir_ADD(c, indirect_offset, @@ -155,24 +106,70 @@ indirect_uniform_load(struct vc4_compile *c, range->size - 4))); qir_TEX_DIRECT(c, indirect_offset, qir_uniform(c, QUNIFORM_UBO_ADDR, 0)); - struct qreg r4 = qir_TEX_RESULT(c); c->num_texture_samples++; - return qir_MOV(c, r4); + return qir_TEX_RESULT(c); } -static struct qreg * -ntq_get_dest(struct vc4_compile *c, nir_dest dest) +nir_ssa_def *vc4_nir_get_state_uniform(struct nir_builder *b, + enum quniform_contents contents) { - assert(!dest.is_ssa); - nir_register *reg = dest.reg.reg; - struct hash_entry *entry = _mesa_hash_table_search(c->def_ht, reg); - assert(reg->num_array_elems == 0); - assert(dest.reg.base_offset == 0); + nir_intrinsic_instr *intr = + nir_intrinsic_instr_create(b->shader, + nir_intrinsic_load_uniform); + intr->const_index[0] = VC4_NIR_STATE_UNIFORM_OFFSET + contents; + intr->num_components = 1; + nir_ssa_dest_init(&intr->instr, &intr->dest, 1, NULL); + nir_builder_instr_insert(b, &intr->instr); + return &intr->dest.ssa; +} - struct qreg *qregs = entry->data; +nir_ssa_def * +vc4_nir_get_swizzled_channel(nir_builder *b, nir_ssa_def **srcs, int swiz) +{ + switch (swiz) { + default: + case UTIL_FORMAT_SWIZZLE_NONE: + fprintf(stderr, "warning: unknown swizzle\n"); + /* FALLTHROUGH */ + case UTIL_FORMAT_SWIZZLE_0: + return nir_imm_float(b, 0.0); + case UTIL_FORMAT_SWIZZLE_1: + return nir_imm_float(b, 1.0); + case UTIL_FORMAT_SWIZZLE_X: + case UTIL_FORMAT_SWIZZLE_Y: + case UTIL_FORMAT_SWIZZLE_Z: + case UTIL_FORMAT_SWIZZLE_W: + return srcs[swiz]; + } +} + +static struct qreg * +ntq_init_ssa_def(struct vc4_compile *c, nir_ssa_def *def) +{ + struct qreg *qregs = ralloc_array(c->def_ht, struct qreg, + def->num_components); + _mesa_hash_table_insert(c->def_ht, def, qregs); return qregs; } +static struct qreg * +ntq_get_dest(struct vc4_compile *c, nir_dest *dest) +{ + if (dest->is_ssa) { + struct qreg *qregs = ntq_init_ssa_def(c, &dest->ssa); + for (int i = 0; i < 
dest->ssa.num_components; i++) + qregs[i] = c->undef; + return qregs; + } else { + nir_register *reg = dest->reg.reg; + assert(dest->reg.base_offset == 0); + assert(reg->num_array_elems == 0); + struct hash_entry *entry = + _mesa_hash_table_search(c->def_ht, reg); + return entry->data; + } +} + static struct qreg ntq_get_src(struct vc4_compile *c, nir_src src, int i) { @@ -282,22 +279,6 @@ qir_srgb_decode(struct vc4_compile *c, struct qreg srgb) } static struct qreg -qir_srgb_encode(struct vc4_compile *c, struct qreg linear) -{ - struct qreg low = qir_FMUL(c, linear, qir_uniform_f(c, 12.92)); - struct qreg high = qir_FSUB(c, - qir_FMUL(c, - qir_uniform_f(c, 1.055), - qir_POW(c, - linear, - qir_uniform_f(c, 0.41666))), - qir_uniform_f(c, 0.055)); - - qir_SF(c, qir_FSUB(c, linear, qir_uniform_f(c, 0.0031308))); - return qir_SEL_X_Y_NS(c, low, high); -} - -static struct qreg ntq_umul(struct vc4_compile *c, struct qreg src0, struct qreg src1) { struct qreg src0_hi = qir_SHR(c, src0, @@ -410,13 +391,13 @@ ntq_emit_tex(struct vc4_compile *c, nir_tex_instr *instr) qir_TEX_S(c, s, texture_u[next_texture_u++]); c->num_texture_samples++; - struct qreg r4 = qir_TEX_RESULT(c); + struct qreg tex = qir_TEX_RESULT(c); enum pipe_format format = c->key->tex[unit].format; struct qreg unpacked[4]; if (util_format_is_depth_or_stencil(format)) { - struct qreg depthf = qir_ITOF(c, qir_SHR(c, r4, + struct qreg depthf = qir_ITOF(c, qir_SHR(c, tex, qir_uniform_ui(c, 8))); struct qreg normalized = qir_FMUL(c, depthf, qir_uniform_f(c, 1.0f/0xffffff)); @@ -468,7 +449,7 @@ ntq_emit_tex(struct vc4_compile *c, nir_tex_instr *instr) unpacked[i] = depth_output; } else { for (int i = 0; i < 4; i++) - unpacked[i] = qir_R4_UNPACK(c, r4, i); + unpacked[i] = qir_UNPACK_8_F(c, tex, i); } const uint8_t *format_swiz = vc4_get_format_swizzle(format); @@ -484,7 +465,7 @@ ntq_emit_tex(struct vc4_compile *c, nir_tex_instr *instr) texture_output[i]); } - struct qreg *dest = ntq_get_dest(c, instr->dest); + 
struct qreg *dest = ntq_get_dest(c, &instr->dest); for (int i = 0; i < 4; i++) { dest[i] = get_swizzled_channel(c, texture_output, c->key->tex[unit].swizzle[i]); @@ -558,7 +539,7 @@ ntq_fsin(struct vc4_compile *c, struct qreg src) struct qreg scaled_x = qir_FMUL(c, src, - qir_uniform_f(c, 1.0f / (M_PI * 2.0f))); + qir_uniform_f(c, 1.0 / (M_PI * 2.0))); struct qreg x = qir_FADD(c, ntq_ffract(c, scaled_x), @@ -756,26 +737,6 @@ emit_fragcoord_input(struct vc4_compile *c, int attr) c->inputs[attr * 4 + 3] = qir_RCP(c, qir_FRAG_W(c)); } -static void -emit_point_coord_input(struct vc4_compile *c, int attr) -{ - if (c->point_x.file == QFILE_NULL) { - c->point_x = qir_uniform_f(c, 0.0); - c->point_y = qir_uniform_f(c, 0.0); - } - - c->inputs[attr * 4 + 0] = c->point_x; - if (c->fs_key->point_coord_upper_left) { - c->inputs[attr * 4 + 1] = qir_FSUB(c, - qir_uniform_f(c, 1.0), - c->point_y); - } else { - c->inputs[attr * 4 + 1] = c->point_y; - } - c->inputs[attr * 4 + 2] = qir_uniform_f(c, 0.0); - c->inputs[attr * 4 + 3] = qir_uniform_f(c, 1.0); -} - static struct qreg emit_fragment_varying(struct vc4_compile *c, uint8_t semantic, uint8_t index, uint8_t swizzle) @@ -817,19 +778,6 @@ emit_fragment_input(struct vc4_compile *c, int attr, } static void -emit_face_input(struct vc4_compile *c, int attr) -{ - c->inputs[attr * 4 + 0] = qir_FSUB(c, - qir_uniform_f(c, 1.0), - qir_FMUL(c, - qir_ITOF(c, qir_FRAG_REV_FLAG(c)), - qir_uniform_f(c, 2.0))); - c->inputs[attr * 4 + 1] = qir_uniform_f(c, 0.0); - c->inputs[attr * 4 + 2] = qir_uniform_f(c, 0.0); - c->inputs[attr * 4 + 3] = qir_uniform_f(c, 1.0); -} - -static void add_output(struct vc4_compile *c, uint32_t decl_offset, uint8_t semantic_name, @@ -884,12 +832,38 @@ ntq_emit_alu(struct vc4_compile *c, nir_alu_instr *instr) for (int i = 0; i < nir_op_infos[instr->op].num_inputs; i++) srcs[i] = ntq_get_src(c, instr->src[i].src, instr->src[i].swizzle[0]); - struct qreg *dest = ntq_get_dest(c, instr->dest.dest); + struct qreg *dest = 
ntq_get_dest(c, &instr->dest.dest); for (int i = 0; i < nir_op_infos[instr->op].num_inputs; i++) dest[i] = srcs[i]; return; } + if (instr->op == nir_op_pack_unorm_4x8) { + struct qreg result; + for (int i = 0; i < 4; i++) { + struct qreg src = ntq_get_src(c, instr->src[0].src, + instr->src[0].swizzle[i]); + if (i == 0) + result = qir_PACK_8888_F(c, src); + else + result = qir_PACK_8_F(c, result, src, i); + } + struct qreg *dest = ntq_get_dest(c, &instr->dest.dest); + *dest = result; + return; + } + + if (instr->op == nir_op_unpack_unorm_4x8) { + struct qreg src = ntq_get_src(c, instr->src[0].src, + instr->src[0].swizzle[0]); + struct qreg *dest = ntq_get_dest(c, &instr->dest.dest); + for (int i = 0; i < 4; i++) { + if (instr->dest.write_mask & (1 << i)) + dest[i] = qir_UNPACK_8_F(c, src, i); + } + return; + } + /* General case: We can just grab the one used channel per src. */ struct qreg src[nir_op_infos[instr->op].num_inputs]; for (int i = 0; i < nir_op_infos[instr->op].num_inputs; i++) { @@ -898,7 +872,7 @@ ntq_emit_alu(struct vc4_compile *c, nir_alu_instr *instr) /* Pick the channel to store the output in. 
*/ assert(!instr->dest.saturate); - struct qreg *dest = ntq_get_dest(c, instr->dest.dest); + struct qreg *dest = ntq_get_dest(c, &instr->dest.dest); assert(util_is_power_of_two(instr->dest.write_mask)); dest += ffs(instr->dest.write_mask) - 1; @@ -1092,167 +1066,6 @@ ntq_emit_alu(struct vc4_compile *c, nir_alu_instr *instr) } } -static struct qreg -vc4_blend_channel(struct vc4_compile *c, - struct qreg *dst, - struct qreg *src, - struct qreg val, - unsigned factor, - int channel) -{ - switch(factor) { - case PIPE_BLENDFACTOR_ONE: - return val; - case PIPE_BLENDFACTOR_SRC_COLOR: - return qir_FMUL(c, val, src[channel]); - case PIPE_BLENDFACTOR_SRC_ALPHA: - return qir_FMUL(c, val, src[3]); - case PIPE_BLENDFACTOR_DST_ALPHA: - return qir_FMUL(c, val, dst[3]); - case PIPE_BLENDFACTOR_DST_COLOR: - return qir_FMUL(c, val, dst[channel]); - case PIPE_BLENDFACTOR_SRC_ALPHA_SATURATE: - if (channel != 3) { - return qir_FMUL(c, - val, - qir_FMIN(c, - src[3], - qir_FSUB(c, - qir_uniform_f(c, 1.0), - dst[3]))); - } else { - return val; - } - case PIPE_BLENDFACTOR_CONST_COLOR: - return qir_FMUL(c, val, - qir_uniform(c, QUNIFORM_BLEND_CONST_COLOR, - channel)); - case PIPE_BLENDFACTOR_CONST_ALPHA: - return qir_FMUL(c, val, - qir_uniform(c, QUNIFORM_BLEND_CONST_COLOR, 3)); - case PIPE_BLENDFACTOR_ZERO: - return qir_uniform_f(c, 0.0); - case PIPE_BLENDFACTOR_INV_SRC_COLOR: - return qir_FMUL(c, val, qir_FSUB(c, qir_uniform_f(c, 1.0), - src[channel])); - case PIPE_BLENDFACTOR_INV_SRC_ALPHA: - return qir_FMUL(c, val, qir_FSUB(c, qir_uniform_f(c, 1.0), - src[3])); - case PIPE_BLENDFACTOR_INV_DST_ALPHA: - return qir_FMUL(c, val, qir_FSUB(c, qir_uniform_f(c, 1.0), - dst[3])); - case PIPE_BLENDFACTOR_INV_DST_COLOR: - return qir_FMUL(c, val, qir_FSUB(c, qir_uniform_f(c, 1.0), - dst[channel])); - case PIPE_BLENDFACTOR_INV_CONST_COLOR: - return qir_FMUL(c, val, - qir_FSUB(c, qir_uniform_f(c, 1.0), - qir_uniform(c, - QUNIFORM_BLEND_CONST_COLOR, - channel))); - case 
PIPE_BLENDFACTOR_INV_CONST_ALPHA: - return qir_FMUL(c, val, - qir_FSUB(c, qir_uniform_f(c, 1.0), - qir_uniform(c, - QUNIFORM_BLEND_CONST_COLOR, - 3))); - - default: - case PIPE_BLENDFACTOR_SRC1_COLOR: - case PIPE_BLENDFACTOR_SRC1_ALPHA: - case PIPE_BLENDFACTOR_INV_SRC1_COLOR: - case PIPE_BLENDFACTOR_INV_SRC1_ALPHA: - /* Unsupported. */ - fprintf(stderr, "Unknown blend factor %d\n", factor); - return val; - } -} - -static struct qreg -vc4_blend_func(struct vc4_compile *c, - struct qreg src, struct qreg dst, - unsigned func) -{ - switch (func) { - case PIPE_BLEND_ADD: - return qir_FADD(c, src, dst); - case PIPE_BLEND_SUBTRACT: - return qir_FSUB(c, src, dst); - case PIPE_BLEND_REVERSE_SUBTRACT: - return qir_FSUB(c, dst, src); - case PIPE_BLEND_MIN: - return qir_FMIN(c, src, dst); - case PIPE_BLEND_MAX: - return qir_FMAX(c, src, dst); - - default: - /* Unsupported. */ - fprintf(stderr, "Unknown blend func %d\n", func); - return src; - - } -} - -/** - * Implements fixed function blending in shader code. - * - * VC4 doesn't have any hardware support for blending. Instead, you read the - * current contents of the destination from the tile buffer after having - * waited for the scoreboard (which is handled by vc4_qpu_emit.c), then do - * math using your output color and that destination value, and update the - * output color appropriately. 
- */ -static void -vc4_blend(struct vc4_compile *c, struct qreg *result, - struct qreg *dst_color, struct qreg *src_color) -{ - struct pipe_rt_blend_state *blend = &c->fs_key->blend; - - if (!blend->blend_enable) { - for (int i = 0; i < 4; i++) - result[i] = src_color[i]; - return; - } - - struct qreg clamped_src[4]; - struct qreg clamped_dst[4]; - for (int i = 0; i < 4; i++) { - clamped_src[i] = qir_SAT(c, src_color[i]); - clamped_dst[i] = qir_SAT(c, dst_color[i]); - } - src_color = clamped_src; - dst_color = clamped_dst; - - struct qreg src_blend[4], dst_blend[4]; - for (int i = 0; i < 3; i++) { - src_blend[i] = vc4_blend_channel(c, - dst_color, src_color, - src_color[i], - blend->rgb_src_factor, i); - dst_blend[i] = vc4_blend_channel(c, - dst_color, src_color, - dst_color[i], - blend->rgb_dst_factor, i); - } - src_blend[3] = vc4_blend_channel(c, - dst_color, src_color, - src_color[3], - blend->alpha_src_factor, 3); - dst_blend[3] = vc4_blend_channel(c, - dst_color, src_color, - dst_color[3], - blend->alpha_dst_factor, 3); - - for (int i = 0; i < 3; i++) { - result[i] = vc4_blend_func(c, - src_blend[i], dst_blend[i], - blend->rgb_func); - } - result[3] = vc4_blend_func(c, - src_blend[3], dst_blend[3], - blend->alpha_func); -} - static void clip_distance_discard(struct vc4_compile *c) { @@ -1276,167 +1089,15 @@ clip_distance_discard(struct vc4_compile *c) } static void -alpha_test_discard(struct vc4_compile *c) -{ - struct qreg src_alpha; - struct qreg alpha_ref = qir_uniform(c, QUNIFORM_ALPHA_REF, 0); - - if (!c->fs_key->alpha_test) - return; - - if (c->output_color_index != -1) - src_alpha = c->outputs[c->output_color_index + 3]; - else - src_alpha = qir_uniform_f(c, 1.0); - - if (c->discard.file == QFILE_NULL) - c->discard = qir_uniform_ui(c, 0); - - switch (c->fs_key->alpha_test_func) { - case PIPE_FUNC_NEVER: - c->discard = qir_uniform_ui(c, ~0); - break; - case PIPE_FUNC_ALWAYS: - break; - case PIPE_FUNC_EQUAL: - qir_SF(c, qir_FSUB(c, src_alpha, alpha_ref)); 
- c->discard = qir_SEL_X_Y_ZS(c, c->discard, - qir_uniform_ui(c, ~0)); - break; - case PIPE_FUNC_NOTEQUAL: - qir_SF(c, qir_FSUB(c, src_alpha, alpha_ref)); - c->discard = qir_SEL_X_Y_ZC(c, c->discard, - qir_uniform_ui(c, ~0)); - break; - case PIPE_FUNC_GREATER: - qir_SF(c, qir_FSUB(c, src_alpha, alpha_ref)); - c->discard = qir_SEL_X_Y_NC(c, c->discard, - qir_uniform_ui(c, ~0)); - break; - case PIPE_FUNC_GEQUAL: - qir_SF(c, qir_FSUB(c, alpha_ref, src_alpha)); - c->discard = qir_SEL_X_Y_NS(c, c->discard, - qir_uniform_ui(c, ~0)); - break; - case PIPE_FUNC_LESS: - qir_SF(c, qir_FSUB(c, src_alpha, alpha_ref)); - c->discard = qir_SEL_X_Y_NS(c, c->discard, - qir_uniform_ui(c, ~0)); - break; - case PIPE_FUNC_LEQUAL: - qir_SF(c, qir_FSUB(c, alpha_ref, src_alpha)); - c->discard = qir_SEL_X_Y_NC(c, c->discard, - qir_uniform_ui(c, ~0)); - break; - } -} - -static struct qreg -vc4_logicop(struct vc4_compile *c, struct qreg src, struct qreg dst) -{ - switch (c->fs_key->logicop_func) { - case PIPE_LOGICOP_CLEAR: - return qir_uniform_f(c, 0.0); - case PIPE_LOGICOP_NOR: - return qir_NOT(c, qir_OR(c, src, dst)); - case PIPE_LOGICOP_AND_INVERTED: - return qir_AND(c, qir_NOT(c, src), dst); - case PIPE_LOGICOP_COPY_INVERTED: - return qir_NOT(c, src); - case PIPE_LOGICOP_AND_REVERSE: - return qir_AND(c, src, qir_NOT(c, dst)); - case PIPE_LOGICOP_INVERT: - return qir_NOT(c, dst); - case PIPE_LOGICOP_XOR: - return qir_XOR(c, src, dst); - case PIPE_LOGICOP_NAND: - return qir_NOT(c, qir_AND(c, src, dst)); - case PIPE_LOGICOP_AND: - return qir_AND(c, src, dst); - case PIPE_LOGICOP_EQUIV: - return qir_NOT(c, qir_XOR(c, src, dst)); - case PIPE_LOGICOP_NOOP: - return dst; - case PIPE_LOGICOP_OR_INVERTED: - return qir_OR(c, qir_NOT(c, src), dst); - case PIPE_LOGICOP_OR_REVERSE: - return qir_OR(c, src, qir_NOT(c, dst)); - case PIPE_LOGICOP_OR: - return qir_OR(c, src, dst); - case PIPE_LOGICOP_SET: - return qir_uniform_ui(c, ~0); - case PIPE_LOGICOP_COPY: - default: - return src; - } -} - -static 
void emit_frag_end(struct vc4_compile *c) { clip_distance_discard(c); - alpha_test_discard(c); - - enum pipe_format color_format = c->fs_key->color_format; - const uint8_t *format_swiz = vc4_get_format_swizzle(color_format); - struct qreg tlb_read_color[4] = { c->undef, c->undef, c->undef, c->undef }; - struct qreg dst_color[4] = { c->undef, c->undef, c->undef, c->undef }; - struct qreg linear_dst_color[4] = { c->undef, c->undef, c->undef, c->undef }; - struct qreg packed_dst_color = c->undef; - - if (c->fs_key->blend.blend_enable || - c->fs_key->blend.colormask != 0xf || - c->fs_key->logicop_func != PIPE_LOGICOP_COPY) { - struct qreg r4 = qir_TLB_COLOR_READ(c); - for (int i = 0; i < 4; i++) - tlb_read_color[i] = qir_R4_UNPACK(c, r4, i); - for (int i = 0; i < 4; i++) { - dst_color[i] = get_swizzled_channel(c, - tlb_read_color, - format_swiz[i]); - if (util_format_is_srgb(color_format) && i != 3) { - linear_dst_color[i] = - qir_srgb_decode(c, dst_color[i]); - } else { - linear_dst_color[i] = dst_color[i]; - } - } - /* Save the packed value for logic ops. Can't reuse r4 - * because other things might smash it (like sRGB) - */ - packed_dst_color = qir_MOV(c, r4); - } - - struct qreg blend_color[4]; - struct qreg undef_array[4] = { - c->undef, c->undef, c->undef, c->undef - }; - vc4_blend(c, blend_color, linear_dst_color, - (c->output_color_index != -1 ? - c->outputs + c->output_color_index : - undef_array)); - - if (util_format_is_srgb(color_format)) { - for (int i = 0; i < 3; i++) - blend_color[i] = qir_srgb_encode(c, blend_color[i]); - } - - /* Debug: Sometimes you're getting a black output and just want to see - * if the FS is getting executed at all. Spam magenta into the color - * output. 
- */ - if (0) { - blend_color[0] = qir_uniform_f(c, 1.0); - blend_color[1] = qir_uniform_f(c, 0.0); - blend_color[2] = qir_uniform_f(c, 1.0); - blend_color[3] = qir_uniform_f(c, 0.5); - } - - struct qreg swizzled_outputs[4]; - for (int i = 0; i < 4; i++) { - swizzled_outputs[i] = get_swizzled_channel(c, blend_color, - format_swiz[i]); + struct qreg color; + if (c->output_color_index != -1) { + color = c->outputs[c->output_color_index]; + } else { + color = qir_uniform_ui(c, 0); } if (c->discard.file != QFILE_NULL) @@ -1463,47 +1124,7 @@ emit_frag_end(struct vc4_compile *c) qir_TLB_Z_WRITE(c, z); } - struct qreg packed_color = c->undef; - for (int i = 0; i < 4; i++) { - if (swizzled_outputs[i].file == QFILE_NULL) - continue; - if (packed_color.file == QFILE_NULL) { - packed_color = qir_PACK_8888_F(c, swizzled_outputs[i]); - } else { - packed_color = qir_PACK_8_F(c, - packed_color, - swizzled_outputs[i], - i); - } - } - - if (packed_color.file == QFILE_NULL) - packed_color = qir_uniform_ui(c, 0); - - if (c->fs_key->logicop_func != PIPE_LOGICOP_COPY) { - packed_color = vc4_logicop(c, packed_color, packed_dst_color); - } - - /* If the bit isn't set in the color mask, then just return the - * original dst color, instead. 
- */ - uint32_t colormask = 0xffffffff; - for (int i = 0; i < 4; i++) { - if (format_swiz[i] < 4 && - !(c->fs_key->blend.colormask & (1 << format_swiz[i]))) { - colormask &= ~(0xff << (i * 8)); - } - } - if (colormask != 0xffffffff) { - packed_color = qir_OR(c, - qir_AND(c, packed_color, - qir_uniform_ui(c, colormask)), - qir_AND(c, packed_dst_color, - qir_uniform_ui(c, ~colormask))); - } - - qir_emit(c, qir_inst(QOP_TLB_COLOR_WRITE, c->undef, - packed_color, c->undef)); + qir_TLB_COLOR_WRITE(c, color); } static void @@ -1695,6 +1316,7 @@ vc4_optimize_nir(struct nir_shader *s) progress = nir_opt_peephole_select(s) || progress; progress = nir_opt_algebraic(s) || progress; progress = nir_opt_constant_folding(s) || progress; + progress = nir_opt_undef(s) || progress; } while (progress); } @@ -1736,6 +1358,7 @@ ntq_setup_inputs(struct vc4_compile *c) unsigned loc = var->data.driver_location; assert(array_len == 1); + (void)array_len; resize_qreg_array(c, &c->inputs, &c->inputs_array_size, (loc + 1) * 4); @@ -1743,11 +1366,12 @@ ntq_setup_inputs(struct vc4_compile *c) if (semantic_name == TGSI_SEMANTIC_POSITION) { emit_fragcoord_input(c, loc); } else if (semantic_name == TGSI_SEMANTIC_FACE) { - emit_face_input(c, loc); + c->inputs[loc * 4 + 0] = qir_FRAG_REV_FLAG(c); } else if (semantic_name == TGSI_SEMANTIC_GENERIC && (c->fs_key->point_sprite_mask & (1 << semantic_index))) { - emit_point_coord_input(c, loc); + c->inputs[loc * 4 + 0] = c->point_x; + c->inputs[loc * 4 + 1] = c->point_y; } else { emit_fragment_input(c, loc, semantic_name, @@ -1770,6 +1394,13 @@ ntq_setup_outputs(struct vc4_compile *c) unsigned loc = var->data.driver_location * 4; assert(array_len == 1); + (void)array_len; + + /* NIR hack to pass through + * TGSI_PROPERTY_FS_COLOR0_WRITES_ALL_CBUFS */ + if (semantic_name == TGSI_SEMANTIC_COLOR && + semantic_index == -1) + semantic_index = 0; for (int i = 0; i < 4; i++) { add_output(c, @@ -1834,8 +1465,7 @@ ntq_setup_registers(struct vc4_compile *c, struct 
exec_list *list) static void ntq_emit_load_const(struct vc4_compile *c, nir_load_const_instr *instr) { - struct qreg *qregs = ralloc_array(c->def_ht, struct qreg, - instr->def.num_components); + struct qreg *qregs = ntq_init_ssa_def(c, &instr->def); for (int i = 0; i < instr->def.num_components; i++) qregs[i] = qir_uniform_ui(c, instr->value.u[i]); @@ -1843,47 +1473,59 @@ ntq_emit_load_const(struct vc4_compile *c, nir_load_const_instr *instr) } static void +ntq_emit_ssa_undef(struct vc4_compile *c, nir_ssa_undef_instr *instr) +{ + struct qreg *qregs = ntq_init_ssa_def(c, &instr->def); + + /* QIR needs there to be *some* value, so pick 0 (same as for + * ntq_setup_registers(). + */ + for (int i = 0; i < instr->def.num_components; i++) + qregs[i] = qir_uniform_ui(c, 0); +} + +static void ntq_emit_intrinsic(struct vc4_compile *c, nir_intrinsic_instr *instr) { const nir_intrinsic_info *info = &nir_intrinsic_infos[instr->intrinsic]; struct qreg *dest = NULL; if (info->has_dest) { - dest = ntq_get_dest(c, instr->dest); + dest = ntq_get_dest(c, &instr->dest); } switch (instr->intrinsic) { case nir_intrinsic_load_uniform: - for (int i = 0; i < instr->num_components; i++) { - dest[i] = qir_uniform(c, QUNIFORM_UNIFORM, - instr->const_index[0] * 4 + i); + assert(instr->num_components == 1); + if (instr->const_index[0] < VC4_NIR_STATE_UNIFORM_OFFSET) { + *dest = qir_uniform(c, QUNIFORM_UNIFORM, + instr->const_index[0]); + } else { + *dest = qir_uniform(c, instr->const_index[0] - + VC4_NIR_STATE_UNIFORM_OFFSET, + 0); } break; case nir_intrinsic_load_uniform_indirect: - for (int i = 0; i < instr->num_components; i++) { - dest[i] = indirect_uniform_load(c, - ntq_get_src(c, instr->src[0], 0), - (instr->const_index[0] * - 4 + i) * sizeof(float)); - } + *dest = indirect_uniform_load(c, instr); break; case nir_intrinsic_load_input: - for (int i = 0; i < instr->num_components; i++) - dest[i] = c->inputs[instr->const_index[0] * 4 + i]; - + assert(instr->num_components == 1); + if 
(instr->const_index[0] == VC4_NIR_TLB_COLOR_READ_INPUT) { + *dest = qir_TLB_COLOR_READ(c); + } else { + *dest = c->inputs[instr->const_index[0]]; + } break; case nir_intrinsic_store_output: - for (int i = 0; i < instr->num_components; i++) { - c->outputs[instr->const_index[0] * 4 + i] = - qir_MOV(c, ntq_get_src(c, instr->src[0], i)); - } - c->num_outputs = MAX2(c->num_outputs, - instr->const_index[0] * 4 + - instr->num_components + 1); + assert(instr->num_components == 1); + c->outputs[instr->const_index[0]] = + qir_MOV(c, ntq_get_src(c, instr->src[0], 0)); + c->num_outputs = MAX2(c->num_outputs, instr->const_index[0] + 1); break; case nir_intrinsic_discard: @@ -1927,6 +1569,10 @@ ntq_emit_instr(struct vc4_compile *c, nir_instr *instr) ntq_emit_load_const(c, nir_instr_as_load_const(instr)); break; + case nir_instr_type_ssa_undef: + ntq_emit_ssa_undef(c, nir_instr_as_ssa_undef(instr)); + break; + case nir_instr_type_tex: ntq_emit_tex(c, nir_instr_as_tex(instr)); break; @@ -2084,13 +1730,17 @@ vc4_shader_ntq(struct vc4_context *vc4, enum qstage stage, c->s = tgsi_to_nir(tokens, &nir_options); nir_opt_global_to_local(c->s); nir_convert_to_ssa(c->s); + if (stage == QSTAGE_FRAG) + vc4_nir_lower_blend(c); + vc4_nir_lower_io(c); nir_lower_idiv(c->s); + nir_lower_load_const_to_scalar(c->s); vc4_optimize_nir(c->s); nir_remove_dead_variables(c->s); - nir_convert_from_ssa(c->s); + nir_convert_from_ssa(c->s, true); if (vc4_debug & VC4_DEBUG_SHADERDB) { fprintf(stderr, "SHADER-DB: %s prog %d/%d: %d NIR instructions\n", @@ -2187,6 +1837,8 @@ copy_uniform_state_to_shader(struct vc4_compiled_shader *shader, memcpy(uinfo->contents, c->uniform_contents, count * sizeof(*uinfo->contents)); uinfo->num_texture_samples = c->num_texture_samples; + + vc4_set_shader_uniform_dirty_flags(shader); } static struct vc4_compiled_shader * @@ -2259,9 +1911,8 @@ vc4_get_compiled_shader(struct vc4_context *vc4, enum qstage stage, } copy_uniform_state_to_shader(shader, c); - shader->bo = 
vc4_bo_alloc_mem(vc4->screen, c->qpu_insts, - c->qpu_inst_count * sizeof(uint64_t), - "code"); + shader->bo = vc4_bo_alloc_shader(vc4->screen, c->qpu_insts, + c->qpu_inst_count * sizeof(uint64_t)); /* Copy the compiler UBO range state to the compiled shader, dropping * out arrays that were never referenced by an indirect load. @@ -2288,10 +1939,12 @@ vc4_get_compiled_shader(struct vc4_context *vc4, enum qstage stage, } } if (shader->ubo_size) { - fprintf(stderr, "SHADER-DB: %s prog %d/%d: %d UBO uniforms\n", - qir_get_stage_name(c->stage), - c->program_id, c->variant_id, - shader->ubo_size / 4); + if (vc4_debug & VC4_DEBUG_SHADERDB) { + fprintf(stderr, "SHADER-DB: %s prog %d/%d: %d UBO uniforms\n", + qir_get_stage_name(c->stage), + c->program_id, c->variant_id, + shader->ubo_size / 4); + } } qir_compile_destroy(c); @@ -2421,9 +2074,20 @@ vc4_update_compiled_vs(struct vc4_context *vc4, uint8_t prim_mode) (prim_mode == PIPE_PRIM_POINTS && vc4->rasterizer->base.point_size_per_vertex); - vc4->prog.vs = vc4_get_compiled_shader(vc4, QSTAGE_VERT, &key->base); + struct vc4_compiled_shader *vs = + vc4_get_compiled_shader(vc4, QSTAGE_VERT, &key->base); + if (vs != vc4->prog.vs) { + vc4->prog.vs = vs; + vc4->dirty |= VC4_DIRTY_COMPILED_VS; + } + key->is_coord = true; - vc4->prog.cs = vc4_get_compiled_shader(vc4, QSTAGE_COORD, &key->base); + struct vc4_compiled_shader *cs = + vc4_get_compiled_shader(vc4, QSTAGE_COORD, &key->base); + if (cs != vc4->prog.cs) { + vc4->prog.cs = cs; + vc4->dirty |= VC4_DIRTY_COMPILED_CS; + } } void @@ -2490,305 +2154,6 @@ vc4_shader_state_delete(struct pipe_context *pctx, void *hwcso) free(so); } -static uint32_t translate_wrap(uint32_t p_wrap, bool using_nearest) -{ - switch (p_wrap) { - case PIPE_TEX_WRAP_REPEAT: - return 0; - case PIPE_TEX_WRAP_CLAMP_TO_EDGE: - return 1; - case PIPE_TEX_WRAP_MIRROR_REPEAT: - return 2; - case PIPE_TEX_WRAP_CLAMP_TO_BORDER: - return 3; - case PIPE_TEX_WRAP_CLAMP: - return (using_nearest ? 
1 : 3); - default: - fprintf(stderr, "Unknown wrap mode %d\n", p_wrap); - assert(!"not reached"); - return 0; - } -} - -static void -write_texture_p0(struct vc4_context *vc4, - struct vc4_texture_stateobj *texstate, - uint32_t unit) -{ - struct pipe_sampler_view *texture = texstate->textures[unit]; - struct vc4_resource *rsc = vc4_resource(texture->texture); - - cl_reloc(vc4, &vc4->uniforms, rsc->bo, - VC4_SET_FIELD(rsc->slices[0].offset >> 12, VC4_TEX_P0_OFFSET) | - VC4_SET_FIELD(texture->u.tex.last_level - - texture->u.tex.first_level, VC4_TEX_P0_MIPLVLS) | - VC4_SET_FIELD(texture->target == PIPE_TEXTURE_CUBE, - VC4_TEX_P0_CMMODE) | - VC4_SET_FIELD(rsc->vc4_format & 15, VC4_TEX_P0_TYPE)); -} - -static void -write_texture_p1(struct vc4_context *vc4, - struct vc4_texture_stateobj *texstate, - uint32_t unit) -{ - struct pipe_sampler_view *texture = texstate->textures[unit]; - struct vc4_resource *rsc = vc4_resource(texture->texture); - struct pipe_sampler_state *sampler = texstate->samplers[unit]; - static const uint8_t minfilter_map[6] = { - VC4_TEX_P1_MINFILT_NEAR_MIP_NEAR, - VC4_TEX_P1_MINFILT_LIN_MIP_NEAR, - VC4_TEX_P1_MINFILT_NEAR_MIP_LIN, - VC4_TEX_P1_MINFILT_LIN_MIP_LIN, - VC4_TEX_P1_MINFILT_NEAREST, - VC4_TEX_P1_MINFILT_LINEAR, - }; - static const uint32_t magfilter_map[] = { - [PIPE_TEX_FILTER_NEAREST] = VC4_TEX_P1_MAGFILT_NEAREST, - [PIPE_TEX_FILTER_LINEAR] = VC4_TEX_P1_MAGFILT_LINEAR, - }; - - bool either_nearest = - (sampler->mag_img_filter == PIPE_TEX_MIPFILTER_NEAREST || - sampler->min_img_filter == PIPE_TEX_MIPFILTER_NEAREST); - - cl_aligned_u32(&vc4->uniforms, - VC4_SET_FIELD(rsc->vc4_format >> 4, VC4_TEX_P1_TYPE4) | - VC4_SET_FIELD(texture->texture->height0 & 2047, - VC4_TEX_P1_HEIGHT) | - VC4_SET_FIELD(texture->texture->width0 & 2047, - VC4_TEX_P1_WIDTH) | - VC4_SET_FIELD(magfilter_map[sampler->mag_img_filter], - VC4_TEX_P1_MAGFILT) | - VC4_SET_FIELD(minfilter_map[sampler->min_mip_filter * 2 + - sampler->min_img_filter], - VC4_TEX_P1_MINFILT) | - 
VC4_SET_FIELD(translate_wrap(sampler->wrap_s, either_nearest), - VC4_TEX_P1_WRAP_S) | - VC4_SET_FIELD(translate_wrap(sampler->wrap_t, either_nearest), - VC4_TEX_P1_WRAP_T)); -} - -static void -write_texture_p2(struct vc4_context *vc4, - struct vc4_texture_stateobj *texstate, - uint32_t data) -{ - uint32_t unit = data & 0xffff; - struct pipe_sampler_view *texture = texstate->textures[unit]; - struct vc4_resource *rsc = vc4_resource(texture->texture); - - cl_aligned_u32(&vc4->uniforms, - VC4_SET_FIELD(VC4_TEX_P2_PTYPE_CUBE_MAP_STRIDE, - VC4_TEX_P2_PTYPE) | - VC4_SET_FIELD(rsc->cube_map_stride >> 12, VC4_TEX_P2_CMST) | - VC4_SET_FIELD((data >> 16) & 1, VC4_TEX_P2_BSLOD)); -} - - -#define SWIZ(x,y,z,w) { \ - UTIL_FORMAT_SWIZZLE_##x, \ - UTIL_FORMAT_SWIZZLE_##y, \ - UTIL_FORMAT_SWIZZLE_##z, \ - UTIL_FORMAT_SWIZZLE_##w \ -} - -static void -write_texture_border_color(struct vc4_context *vc4, - struct vc4_texture_stateobj *texstate, - uint32_t unit) -{ - struct pipe_sampler_state *sampler = texstate->samplers[unit]; - struct pipe_sampler_view *texture = texstate->textures[unit]; - struct vc4_resource *rsc = vc4_resource(texture->texture); - union util_color uc; - - const struct util_format_description *tex_format_desc = - util_format_description(texture->format); - - float border_color[4]; - for (int i = 0; i < 4; i++) - border_color[i] = sampler->border_color.f[i]; - if (util_format_is_srgb(texture->format)) { - for (int i = 0; i < 3; i++) - border_color[i] = - util_format_linear_to_srgb_float(border_color[i]); - } - - /* Turn the border color into the layout of channels that it would - * have when stored as texture contents. - */ - float storage_color[4]; - util_format_unswizzle_4f(storage_color, - border_color, - tex_format_desc->swizzle); - - /* Now, pack so that when the vc4_format-sampled texture contents are - * replaced with our border color, the vc4_get_format_swizzle() - * swizzling will get the right channels. 
- */ - if (util_format_is_depth_or_stencil(texture->format)) { - uc.ui[0] = util_pack_z(PIPE_FORMAT_Z24X8_UNORM, - sampler->border_color.f[0]) << 8; - } else { - switch (rsc->vc4_format) { - default: - case VC4_TEXTURE_TYPE_RGBA8888: - util_pack_color(storage_color, - PIPE_FORMAT_R8G8B8A8_UNORM, &uc); - break; - case VC4_TEXTURE_TYPE_RGBA4444: - util_pack_color(storage_color, - PIPE_FORMAT_A8B8G8R8_UNORM, &uc); - break; - case VC4_TEXTURE_TYPE_RGB565: - util_pack_color(storage_color, - PIPE_FORMAT_B8G8R8A8_UNORM, &uc); - break; - case VC4_TEXTURE_TYPE_ALPHA: - uc.ui[0] = float_to_ubyte(storage_color[0]) << 24; - break; - case VC4_TEXTURE_TYPE_LUMALPHA: - uc.ui[0] = ((float_to_ubyte(storage_color[1]) << 24) | - (float_to_ubyte(storage_color[0]) << 0)); - break; - } - } - - cl_aligned_u32(&vc4->uniforms, uc.ui[0]); -} - -static uint32_t -get_texrect_scale(struct vc4_texture_stateobj *texstate, - enum quniform_contents contents, - uint32_t data) -{ - struct pipe_sampler_view *texture = texstate->textures[data]; - uint32_t dim; - - if (contents == QUNIFORM_TEXRECT_SCALE_X) - dim = texture->texture->width0; - else - dim = texture->texture->height0; - - return fui(1.0f / dim); -} - -static struct vc4_bo * -vc4_upload_ubo(struct vc4_context *vc4, struct vc4_compiled_shader *shader, - const uint32_t *gallium_uniforms) -{ - if (!shader->ubo_size) - return NULL; - - struct vc4_bo *ubo = vc4_bo_alloc(vc4->screen, shader->ubo_size, "ubo"); - uint32_t *data = vc4_bo_map(ubo); - for (uint32_t i = 0; i < shader->num_ubo_ranges; i++) { - memcpy(data + shader->ubo_ranges[i].dst_offset, - gallium_uniforms + shader->ubo_ranges[i].src_offset, - shader->ubo_ranges[i].size); - } - - return ubo; -} - -void -vc4_write_uniforms(struct vc4_context *vc4, struct vc4_compiled_shader *shader, - struct vc4_constbuf_stateobj *cb, - struct vc4_texture_stateobj *texstate) -{ - struct vc4_shader_uniform_info *uinfo = &shader->uniforms; - const uint32_t *gallium_uniforms = cb->cb[0].user_buffer; - 
struct vc4_bo *ubo = vc4_upload_ubo(vc4, shader, gallium_uniforms); - - cl_ensure_space(&vc4->uniforms, (uinfo->count + - uinfo->num_texture_samples) * 4); - - cl_start_shader_reloc(&vc4->uniforms, uinfo->num_texture_samples); - - for (int i = 0; i < uinfo->count; i++) { - - switch (uinfo->contents[i]) { - case QUNIFORM_CONSTANT: - cl_aligned_u32(&vc4->uniforms, uinfo->data[i]); - break; - case QUNIFORM_UNIFORM: - cl_aligned_u32(&vc4->uniforms, - gallium_uniforms[uinfo->data[i]]); - break; - case QUNIFORM_VIEWPORT_X_SCALE: - cl_aligned_f(&vc4->uniforms, vc4->viewport.scale[0] * 16.0f); - break; - case QUNIFORM_VIEWPORT_Y_SCALE: - cl_aligned_f(&vc4->uniforms, vc4->viewport.scale[1] * 16.0f); - break; - - case QUNIFORM_VIEWPORT_Z_OFFSET: - cl_aligned_f(&vc4->uniforms, vc4->viewport.translate[2]); - break; - case QUNIFORM_VIEWPORT_Z_SCALE: - cl_aligned_f(&vc4->uniforms, vc4->viewport.scale[2]); - break; - - case QUNIFORM_USER_CLIP_PLANE: - cl_aligned_f(&vc4->uniforms, - vc4->clip.ucp[uinfo->data[i] / 4][uinfo->data[i] % 4]); - break; - - case QUNIFORM_TEXTURE_CONFIG_P0: - write_texture_p0(vc4, texstate, uinfo->data[i]); - break; - - case QUNIFORM_TEXTURE_CONFIG_P1: - write_texture_p1(vc4, texstate, uinfo->data[i]); - break; - - case QUNIFORM_TEXTURE_CONFIG_P2: - write_texture_p2(vc4, texstate, uinfo->data[i]); - break; - - case QUNIFORM_UBO_ADDR: - cl_aligned_reloc(vc4, &vc4->uniforms, ubo, 0); - break; - - case QUNIFORM_TEXTURE_BORDER_COLOR: - write_texture_border_color(vc4, texstate, uinfo->data[i]); - break; - - case QUNIFORM_TEXRECT_SCALE_X: - case QUNIFORM_TEXRECT_SCALE_Y: - cl_aligned_u32(&vc4->uniforms, - get_texrect_scale(texstate, - uinfo->contents[i], - uinfo->data[i])); - break; - - case QUNIFORM_BLEND_CONST_COLOR: - cl_aligned_f(&vc4->uniforms, - CLAMP(vc4->blend_color.color[uinfo->data[i]], 0, 1)); - break; - - case QUNIFORM_STENCIL: - cl_aligned_u32(&vc4->uniforms, - vc4->zsa->stencil_uniforms[uinfo->data[i]] | - (uinfo->data[i] <= 1 ? 
- (vc4->stencil_ref.ref_value[uinfo->data[i]] << 8) : - 0)); - break; - - case QUNIFORM_ALPHA_REF: - cl_aligned_f(&vc4->uniforms, - vc4->zsa->base.alpha.ref_value); - break; - } -#if 0 - uint32_t written_val = *(uint32_t *)(vc4->uniforms.next - 4); - fprintf(stderr, "%p: %d / 0x%08x (%f)\n", - shader, i, written_val, uif(written_val)); -#endif - } -} - static void vc4_fp_state_bind(struct pipe_context *pctx, void *hwcso) { diff --git a/src/gallium/drivers/vc4/vc4_qir.c b/src/gallium/drivers/vc4/vc4_qir.c index 1c96ef4795f..254140a72f5 100644 --- a/src/gallium/drivers/vc4/vc4_qir.c +++ b/src/gallium/drivers/vc4/vc4_qir.c @@ -96,10 +96,6 @@ static const struct qir_op_info qir_op_info[] = { [QOP_TEX_B] = { "tex_b", 0, 2 }, [QOP_TEX_DIRECT] = { "tex_direct", 0, 2 }, [QOP_TEX_RESULT] = { "tex_result", 1, 0, true }, - [QOP_R4_UNPACK_A] = { "r4_unpack_a", 1, 1 }, - [QOP_R4_UNPACK_B] = { "r4_unpack_b", 1, 1 }, - [QOP_R4_UNPACK_C] = { "r4_unpack_c", 1, 1 }, - [QOP_R4_UNPACK_D] = { "r4_unpack_d", 1, 1 }, [QOP_UNPACK_8A_F] = { "unpack_8a_f", 1, 1 }, [QOP_UNPACK_8B_F] = { "unpack_8b_f", 1, 1 }, [QOP_UNPACK_8C_F] = { "unpack_8c_f", 1, 1 }, @@ -234,20 +230,6 @@ qir_writes_r4(struct qinst *inst) } } -bool -qir_reads_r4(struct qinst *inst) -{ - switch (inst->op) { - case QOP_R4_UNPACK_A: - case QOP_R4_UNPACK_B: - case QOP_R4_UNPACK_C: - case QOP_R4_UNPACK_D: - return true; - default: - return false; - } -} - static void qir_print_reg(struct vc4_compile *c, struct qreg reg, bool write) { diff --git a/src/gallium/drivers/vc4/vc4_qir.h b/src/gallium/drivers/vc4/vc4_qir.h index 732cfd0b306..cade795c12a 100644 --- a/src/gallium/drivers/vc4/vc4_qir.h +++ b/src/gallium/drivers/vc4/vc4_qir.h @@ -36,6 +36,11 @@ #include "util/list.h" #include "util/u_math.h" +#include "vc4_screen.h" +#include "pipe/p_state.h" + +struct nir_builder; + enum qfile { QFILE_NULL, QFILE_TEMP, @@ -155,10 +160,6 @@ enum qop { * the destination */ QOP_TEX_RESULT, - QOP_R4_UNPACK_A, - QOP_R4_UNPACK_B, - 
QOP_R4_UNPACK_C, - QOP_R4_UNPACK_D }; struct queued_qpu_inst { @@ -243,7 +244,11 @@ enum quniform_contents { QUNIFORM_TEXTURE_BORDER_COLOR, - QUNIFORM_BLEND_CONST_COLOR, + QUNIFORM_BLEND_CONST_COLOR_X, + QUNIFORM_BLEND_CONST_COLOR_Y, + QUNIFORM_BLEND_CONST_COLOR_Z, + QUNIFORM_BLEND_CONST_COLOR_W, + QUNIFORM_STENCIL, QUNIFORM_ALPHA_REF, @@ -280,6 +285,52 @@ struct vc4_compiler_ubo_range { bool used; }; +struct vc4_key { + struct vc4_uncompiled_shader *shader_state; + struct { + enum pipe_format format; + unsigned compare_mode:1; + unsigned compare_func:3; + unsigned wrap_s:3; + unsigned wrap_t:3; + uint8_t swizzle[4]; + } tex[VC4_MAX_TEXTURE_SAMPLERS]; + uint8_t ucp_enables; +}; + +struct vc4_fs_key { + struct vc4_key base; + enum pipe_format color_format; + bool depth_enabled; + bool stencil_enabled; + bool stencil_twoside; + bool stencil_full_writemasks; + bool is_points; + bool is_lines; + bool alpha_test; + bool point_coord_upper_left; + bool light_twoside; + uint8_t alpha_test_func; + uint8_t logicop_func; + uint32_t point_sprite_mask; + + struct pipe_rt_blend_state blend; +}; + +struct vc4_vs_key { + struct vc4_key base; + + /** + * This is a proxy for the array of FS input semantics, which is + * larger than we would want to put in the key. + */ + uint64_t compiled_fs_id; + + enum pipe_format attr_formats[8]; + bool is_coord; + bool per_vertex_point_size; +}; + struct vc4_compile { struct vc4_context *vc4; nir_shader *s; @@ -369,6 +420,16 @@ struct vc4_compile { uint32_t variant_id; }; +/* Special nir_load_input intrinsic index for loading the current TLB + * destination color. + */ +#define VC4_NIR_TLB_COLOR_READ_INPUT 2000000000 + +/* Special offset for nir_load_uniform values to get a QUNIFORM_* + * state-dependent value. 
+ */ +#define VC4_NIR_STATE_UNIFORM_OFFSET 2000000000 + struct vc4_compile *qir_compile_init(void); void qir_compile_destroy(struct vc4_compile *c); struct qinst *qir_inst(enum qop op, struct qreg dst, @@ -393,7 +454,6 @@ bool qir_is_multi_instruction(struct qinst *inst); bool qir_is_tex(struct qinst *inst); bool qir_depends_on_flags(struct qinst *inst); bool qir_writes_r4(struct qinst *inst); -bool qir_reads_r4(struct qinst *inst); bool qir_src_needs_a_file(struct qinst *inst); struct qreg qir_follow_movs(struct vc4_compile *c, struct qreg reg); @@ -409,6 +469,12 @@ bool qir_opt_cse(struct vc4_compile *c); bool qir_opt_dead_code(struct vc4_compile *c); bool qir_opt_small_immediates(struct vc4_compile *c); bool qir_opt_vpm_writes(struct vc4_compile *c); +void vc4_nir_lower_blend(struct vc4_compile *c); +void vc4_nir_lower_io(struct vc4_compile *c); +nir_ssa_def *vc4_nir_get_state_uniform(struct nir_builder *b, + enum quniform_contents contents); +nir_ssa_def *vc4_nir_get_swizzled_channel(struct nir_builder *b, + nir_ssa_def **srcs, int swiz); void qir_lower_uniforms(struct vc4_compile *c); void qpu_schedule_instructions(struct vc4_compile *c); @@ -523,27 +589,12 @@ QIR_ALU0(FRAG_W) QIR_ALU0(FRAG_REV_FLAG) QIR_ALU0(TEX_RESULT) QIR_ALU0(TLB_COLOR_READ) +QIR_NODST_1(TLB_COLOR_WRITE) QIR_NODST_1(TLB_Z_WRITE) QIR_NODST_1(TLB_DISCARD_SETUP) QIR_NODST_1(TLB_STENCIL_SETUP) static inline struct qreg -qir_R4_UNPACK(struct vc4_compile *c, struct qreg r4, int i) -{ - struct qreg t = qir_get_temp(c); - qir_emit(c, qir_inst(QOP_R4_UNPACK_A + i, t, r4, c->undef)); - return t; -} - -static inline struct qreg -qir_SEL_X_0_COND(struct vc4_compile *c, int i) -{ - struct qreg t = qir_get_temp(c); - qir_emit(c, qir_inst(QOP_R4_UNPACK_A + i, t, c->undef, c->undef)); - return t; -} - -static inline struct qreg qir_UNPACK_8_F(struct vc4_compile *c, struct qreg src, int i) { struct qreg t = qir_get_temp(c); diff --git a/src/gallium/drivers/vc4/vc4_qir_lower_uniforms.c 
b/src/gallium/drivers/vc4/vc4_qir_lower_uniforms.c index 910c89dca79..f087c3b81b5 100644 --- a/src/gallium/drivers/vc4/vc4_qir_lower_uniforms.c +++ b/src/gallium/drivers/vc4/vc4_qir_lower_uniforms.c @@ -52,7 +52,7 @@ static void add_uniform(struct hash_table *ht, struct qreg reg) { struct hash_entry *entry; - void *key = (void *)(uintptr_t)reg.index; + void *key = (void *)(uintptr_t)(reg.index + 1); entry = _mesa_hash_table_search(ht, key); if (entry) { @@ -66,7 +66,7 @@ static void remove_uniform(struct hash_table *ht, struct qreg reg) { struct hash_entry *entry; - void *key = (void *)(uintptr_t)reg.index; + void *key = (void *)(uintptr_t)(reg.index + 1); entry = _mesa_hash_table_search(ht, key); assert(entry); @@ -122,7 +122,7 @@ qir_lower_uniforms(struct vc4_compile *c) struct hash_entry *entry; hash_table_foreach(ht, entry) { uint32_t count = (uintptr_t)entry->data; - uint32_t index = (uintptr_t)entry->key; + uint32_t index = (uintptr_t)entry->key - 1; if (count > max_count) { max_count = count; max_index = index; diff --git a/src/gallium/drivers/vc4/vc4_qpu.h b/src/gallium/drivers/vc4/vc4_qpu.h index c9ab6344589..fbb90ba12a0 100644 --- a/src/gallium/drivers/vc4/vc4_qpu.h +++ b/src/gallium/drivers/vc4/vc4_qpu.h @@ -122,23 +122,23 @@ static inline struct qpu_reg qpu_r3(void) { return qpu_rn(3); } static inline struct qpu_reg qpu_r4(void) { return qpu_rn(4); } static inline struct qpu_reg qpu_r5(void) { return qpu_rn(5); } -uint64_t qpu_NOP(void); -uint64_t qpu_a_MOV(struct qpu_reg dst, struct qpu_reg src); -uint64_t qpu_m_MOV(struct qpu_reg dst, struct qpu_reg src); +uint64_t qpu_NOP(void) ATTRIBUTE_CONST; +uint64_t qpu_a_MOV(struct qpu_reg dst, struct qpu_reg src) ATTRIBUTE_CONST; +uint64_t qpu_m_MOV(struct qpu_reg dst, struct qpu_reg src) ATTRIBUTE_CONST; uint64_t qpu_a_alu2(enum qpu_op_add op, struct qpu_reg dst, - struct qpu_reg src0, struct qpu_reg src1); + struct qpu_reg src0, struct qpu_reg src1) ATTRIBUTE_CONST; uint64_t qpu_m_alu2(enum qpu_op_mul op, 
struct qpu_reg dst, - struct qpu_reg src0, struct qpu_reg src1); -uint64_t qpu_merge_inst(uint64_t a, uint64_t b); -uint64_t qpu_load_imm_ui(struct qpu_reg dst, uint32_t val); -uint64_t qpu_set_sig(uint64_t inst, uint32_t sig); -uint64_t qpu_set_cond_add(uint64_t inst, uint32_t cond); -uint64_t qpu_set_cond_mul(uint64_t inst, uint32_t cond); -uint32_t qpu_encode_small_immediate(uint32_t i); - -bool qpu_waddr_is_tlb(uint32_t waddr); -bool qpu_inst_is_tlb(uint64_t inst); -int qpu_num_sf_accesses(uint64_t inst); + struct qpu_reg src0, struct qpu_reg src1) ATTRIBUTE_CONST; +uint64_t qpu_merge_inst(uint64_t a, uint64_t b) ATTRIBUTE_CONST; +uint64_t qpu_load_imm_ui(struct qpu_reg dst, uint32_t val) ATTRIBUTE_CONST; +uint64_t qpu_set_sig(uint64_t inst, uint32_t sig) ATTRIBUTE_CONST; +uint64_t qpu_set_cond_add(uint64_t inst, uint32_t cond) ATTRIBUTE_CONST; +uint64_t qpu_set_cond_mul(uint64_t inst, uint32_t cond) ATTRIBUTE_CONST; +uint32_t qpu_encode_small_immediate(uint32_t i) ATTRIBUTE_CONST; + +bool qpu_waddr_is_tlb(uint32_t waddr) ATTRIBUTE_CONST; +bool qpu_inst_is_tlb(uint64_t inst) ATTRIBUTE_CONST; +int qpu_num_sf_accesses(uint64_t inst) ATTRIBUTE_CONST; void qpu_serialize_one_inst(struct vc4_compile *c, uint64_t inst); static inline uint64_t diff --git a/src/gallium/drivers/vc4/vc4_qpu_disasm.c b/src/gallium/drivers/vc4/vc4_qpu_disasm.c index 55e0e6139b5..00aeb300a9b 100644 --- a/src/gallium/drivers/vc4/vc4_qpu_disasm.c +++ b/src/gallium/drivers/vc4/vc4_qpu_disasm.c @@ -225,7 +225,7 @@ static const char *qpu_condflags[] = { }; #define DESC(array, index) \ - ((index > ARRAY_SIZE(array) || !(array)[index]) ? \ + ((index >= ARRAY_SIZE(array) || !(array)[index]) ? \ "???" 
: (array)[index]) static const char * diff --git a/src/gallium/drivers/vc4/vc4_qpu_emit.c b/src/gallium/drivers/vc4/vc4_qpu_emit.c index 99afe4b8798..f324056258c 100644 --- a/src/gallium/drivers/vc4/vc4_qpu_emit.c +++ b/src/gallium/drivers/vc4/vc4_qpu_emit.c @@ -234,6 +234,7 @@ vc4_generate_code(struct vc4_context *vc4, struct vc4_compile *c) case QFILE_VPM: assert((int)qinst->src[i].index >= last_vpm_read_index); + (void)last_vpm_read_index; last_vpm_read_index = qinst->src[i].index; src[i] = qpu_ra(QPU_R_VPM); break; @@ -319,7 +320,8 @@ vc4_generate_code(struct vc4_context *vc4, struct vc4_compile *c) abort(); } - queue(c, qpu_a_MOV(dst, qpu_r4())); + if (dst.mux != QPU_MUX_R4) + queue(c, qpu_a_MOV(dst, qpu_r4())); break; @@ -402,6 +404,8 @@ vc4_generate_code(struct vc4_context *vc4, struct vc4_compile *c) *last_inst(c) = qpu_set_sig(*last_inst(c), QPU_SIG_COLOR_LOAD); + if (dst.mux != QPU_MUX_R4) + queue(c, qpu_a_MOV(dst, qpu_r4())); break; case QOP_TLB_COLOR_WRITE: @@ -451,21 +455,8 @@ vc4_generate_code(struct vc4_context *vc4, struct vc4_compile *c) queue(c, qpu_NOP()); *last_inst(c) = qpu_set_sig(*last_inst(c), QPU_SIG_LOAD_TMU0); - - break; - - case QOP_R4_UNPACK_A: - case QOP_R4_UNPACK_B: - case QOP_R4_UNPACK_C: - case QOP_R4_UNPACK_D: - assert(src[0].mux == QPU_MUX_R4); - queue(c, qpu_a_MOV(dst, src[0])); - *last_inst(c) |= QPU_PM; - *last_inst(c) |= QPU_SET_FIELD(QPU_UNPACK_8A + - (qinst->op - - QOP_R4_UNPACK_A), - QPU_UNPACK); - + if (dst.mux != QPU_MUX_R4) + queue(c, qpu_a_MOV(dst, qpu_r4())); break; case QOP_UNPACK_8A_F: @@ -474,20 +465,30 @@ vc4_generate_code(struct vc4_context *vc4, struct vc4_compile *c) case QOP_UNPACK_8D_F: case QOP_UNPACK_16A_F: case QOP_UNPACK_16B_F: { - assert(src[0].mux == QPU_MUX_A); - - /* Since we're setting the pack bits, if the - * destination is in A it would get re-packed. - */ - queue(c, qpu_a_FMAX((dst.mux == QPU_MUX_A ? 
- qpu_rb(31) : dst), - src[0], src[0])); - *last_inst(c) |= QPU_SET_FIELD(unpack_map[qinst->op - - QOP_UNPACK_8A_F], - QPU_UNPACK); + if (src[0].mux == QPU_MUX_R4) { + queue(c, qpu_a_MOV(dst, src[0])); + *last_inst(c) |= QPU_PM; + *last_inst(c) |= QPU_SET_FIELD(QPU_UNPACK_8A + + (qinst->op - + QOP_UNPACK_8A_F), + QPU_UNPACK); + } else { + assert(src[0].mux == QPU_MUX_A); - if (dst.mux == QPU_MUX_A) { - queue(c, qpu_a_MOV(dst, qpu_rb(31))); + /* Since we're setting the pack bits, if the + * destination is in A it would get re-packed. + */ + queue(c, qpu_a_FMAX((dst.mux == QPU_MUX_A ? + qpu_rb(31) : dst), + src[0], src[0])); + *last_inst(c) |= + QPU_SET_FIELD(unpack_map[qinst->op - + QOP_UNPACK_8A_F], + QPU_UNPACK); + + if (dst.mux == QPU_MUX_A) { + queue(c, qpu_a_MOV(dst, qpu_rb(31))); + } } } break; diff --git a/src/gallium/drivers/vc4/vc4_qpu_validate.c b/src/gallium/drivers/vc4/vc4_qpu_validate.c index 8471edbf62c..9cf6841f41c 100644 --- a/src/gallium/drivers/vc4/vc4_qpu_validate.c +++ b/src/gallium/drivers/vc4/vc4_qpu_validate.c @@ -23,6 +23,13 @@ #include "vc4_qpu.h" +#ifdef NDEBUG +/* Since most of our code is used in assert()s, don't warn about dead code. 
*/ +#pragma GCC diagnostic ignored "-Wunused-but-set-variable" +#pragma GCC diagnostic ignored "-Wunused-variable" +#pragma GCC diagnostic ignored "-Wunused-function" +#endif + static bool writes_reg(uint64_t inst, uint32_t w) { diff --git a/src/gallium/drivers/vc4/vc4_register_allocate.c b/src/gallium/drivers/vc4/vc4_register_allocate.c index 3b0b890b66a..a29db1f3abe 100644 --- a/src/gallium/drivers/vc4/vc4_register_allocate.c +++ b/src/gallium/drivers/vc4/vc4_register_allocate.c @@ -116,6 +116,8 @@ vc4_alloc_reg_set(struct vc4_context *vc4) vc4->regs = ra_alloc_reg_set(vc4, ARRAY_SIZE(vc4_regs)); vc4->reg_class_any = ra_alloc_reg_class(vc4->regs); + vc4->reg_class_r4_or_a = ra_alloc_reg_class(vc4->regs); + vc4->reg_class_a = ra_alloc_reg_class(vc4->regs); for (uint32_t i = 0; i < ARRAY_SIZE(vc4_regs); i++) { /* Reserve ra31/rb31 for spilling fixup_raddr_conflict() in * vc4_qpu_emit.c @@ -126,15 +128,18 @@ vc4_alloc_reg_set(struct vc4_context *vc4) /* R4 can't be written as a general purpose register. (it's * TMU_NOSWAP as a write address). */ - if (vc4_regs[i].mux == QPU_MUX_R4) + if (vc4_regs[i].mux == QPU_MUX_R4) { + ra_class_add_reg(vc4->regs, vc4->reg_class_r4_or_a, i); continue; + } ra_class_add_reg(vc4->regs, vc4->reg_class_any, i); } - vc4->reg_class_a = ra_alloc_reg_class(vc4->regs); - for (uint32_t i = AB_INDEX; i < AB_INDEX + 64; i += 2) + for (uint32_t i = AB_INDEX; i < AB_INDEX + 64; i += 2) { ra_class_add_reg(vc4->regs, vc4->reg_class_a, i); + ra_class_add_reg(vc4->regs, vc4->reg_class_r4_or_a, i); + } ra_set_finalize(vc4->regs, NULL); } @@ -153,6 +158,10 @@ node_to_temp_priority(const void *in_a, const void *in_b) return a->priority - b->priority; } +#define CLASS_BIT_A (1 << 0) +#define CLASS_BIT_B_OR_ACC (1 << 1) +#define CLASS_BIT_R4 (1 << 2) + /** * Returns a mapping from QFILE_TEMP indices to struct qpu_regs. 
* @@ -165,6 +174,7 @@ vc4_register_allocate(struct vc4_context *vc4, struct vc4_compile *c) uint32_t temp_to_node[c->num_temps]; uint32_t def[c->num_temps]; uint32_t use[c->num_temps]; + uint8_t class_bits[c->num_temps]; struct qpu_reg *temp_registers = calloc(c->num_temps, sizeof(*temp_registers)); memset(def, 0, sizeof(def)); @@ -181,10 +191,6 @@ vc4_register_allocate(struct vc4_context *vc4, struct vc4_compile *c) struct ra_graph *g = ra_alloc_interference_graph(vc4->regs, c->num_temps); - for (uint32_t i = 0; i < c->num_temps; i++) { - ra_set_node_class(g, i, vc4->reg_class_any); - } - /* Compute the live ranges so we can figure out interference. */ uint32_t ip = 0; @@ -223,8 +229,33 @@ vc4_register_allocate(struct vc4_context *vc4, struct vc4_compile *c) temp_to_node[map[i].temp] = i; } - /* Figure out our register classes and preallocated registers*/ + /* Figure out our register classes and preallocated registers. We + * start with any temp being able to be in any file, then instructions + * incrementally remove bits that the temp definitely can't be in. + */ + memset(class_bits, + CLASS_BIT_A | CLASS_BIT_B_OR_ACC | CLASS_BIT_R4, + sizeof(class_bits)); + + ip = 0; list_for_each_entry(struct qinst, inst, &c->instructions, link) { + if (qir_writes_r4(inst)) { + /* This instruction writes r4 (and optionally moves + * its result to a temp), so nothing else can be + * stored in r4 across it. + */ + for (int i = 0; i < c->num_temps; i++) { + if (def[i] < ip && use[i] > ip) + class_bits[i] &= ~CLASS_BIT_R4; + } + } else { + /* R4 can't be written as a general purpose + * register. (it's TMU_NOSWAP as a write address). 
+ */ + if (inst->dst.file == QFILE_TEMP) + class_bits[inst->dst.index] &= ~CLASS_BIT_R4; + } + switch (inst->op) { case QOP_FRAG_Z: ra_set_node_reg(g, temp_to_node[inst->dst.index], @@ -236,17 +267,9 @@ vc4_register_allocate(struct vc4_context *vc4, struct vc4_compile *c) AB_INDEX + QPU_R_FRAG_PAYLOAD_ZW * 2); break; - case QOP_TEX_RESULT: - case QOP_TLB_COLOR_READ: - assert(vc4_regs[ACC_INDEX + 4].mux == QPU_MUX_R4); - ra_set_node_reg(g, temp_to_node[inst->dst.index], - ACC_INDEX + 4); - break; - case QOP_PACK_SCALED: /* The pack flags require an A-file dst register. */ - ra_set_node_class(g, temp_to_node[inst->dst.index], - vc4->reg_class_a); + class_bits[inst->dst.index] &= CLASS_BIT_A; break; default: @@ -254,8 +277,30 @@ vc4_register_allocate(struct vc4_context *vc4, struct vc4_compile *c) } if (qir_src_needs_a_file(inst)) { - ra_set_node_class(g, temp_to_node[inst->src[0].index], - vc4->reg_class_a); + class_bits[inst->src[0].index] &= CLASS_BIT_A; + } + ip++; + } + + for (uint32_t i = 0; i < c->num_temps; i++) { + int node = temp_to_node[i]; + + switch (class_bits[i]) { + case CLASS_BIT_A | CLASS_BIT_B_OR_ACC | CLASS_BIT_R4: + case CLASS_BIT_A | CLASS_BIT_B_OR_ACC: + ra_set_node_class(g, node, vc4->reg_class_any); + break; + case CLASS_BIT_A | CLASS_BIT_R4: + ra_set_node_class(g, node, vc4->reg_class_r4_or_a); + break; + case CLASS_BIT_A: + ra_set_node_class(g, node, vc4->reg_class_a); + break; + default: + fprintf(stderr, "temp %d: bad class bits: 0x%x\n", + i, class_bits[i]); + abort(); + break; } } @@ -270,7 +315,11 @@ vc4_register_allocate(struct vc4_context *vc4, struct vc4_compile *c) } bool ok = ra_allocate(g); - assert(ok); + if (!ok) { + fprintf(stderr, "Failed to register allocate:\n"); + qir_dump(c); + abort(); + } for (uint32_t i = 0; i < c->num_temps; i++) { temp_registers[i] = vc4_regs[ra_get_node_reg(g, temp_to_node[i])]; diff --git a/src/gallium/drivers/vc4/vc4_resource.c b/src/gallium/drivers/vc4/vc4_resource.c index cab76406055..5d5166fd818 
100644 --- a/src/gallium/drivers/vc4/vc4_resource.c +++ b/src/gallium/drivers/vc4/vc4_resource.c @@ -102,6 +102,12 @@ vc4_resource_transfer_map(struct pipe_context *pctx, if (usage & PIPE_TRANSFER_DISCARD_WHOLE_RESOURCE) { vc4_resource_bo_alloc(rsc); + + /* If it might be bound as one of our vertex buffers, make + * sure we re-emit vertex buffer state. + */ + if (prsc->bind & PIPE_BIND_VERTEX_BUFFER) + vc4->dirty |= VC4_DIRTY_VTXBUF; } else if (!(usage & PIPE_TRANSFER_UNSYNCHRONIZED)) { if (vc4_cl_references_bo(pctx, rsc->bo)) { if ((usage & PIPE_TRANSFER_DISCARD_RANGE) && @@ -110,6 +116,8 @@ vc4_resource_transfer_map(struct pipe_context *pctx, prsc->height0 == box->height && prsc->depth0 == box->depth) { vc4_resource_bo_alloc(rsc); + if (prsc->bind & PIPE_BIND_VERTEX_BUFFER) + vc4->dirty |= VC4_DIRTY_VTXBUF; } else { vc4_flush(pctx); } diff --git a/src/gallium/drivers/vc4/vc4_resource.h b/src/gallium/drivers/vc4/vc4_resource.h index ab8f5d3cd55..87571b75e8b 100644 --- a/src/gallium/drivers/vc4/vc4_resource.h +++ b/src/gallium/drivers/vc4/vc4_resource.h @@ -82,19 +82,19 @@ struct vc4_resource { struct pipe_resource *shadow_parent; }; -static INLINE struct vc4_resource * +static inline struct vc4_resource * vc4_resource(struct pipe_resource *prsc) { return (struct vc4_resource *)prsc; } -static INLINE struct vc4_surface * +static inline struct vc4_surface * vc4_surface(struct pipe_surface *psurf) { return (struct vc4_surface *)psurf; } -static INLINE struct vc4_transfer * +static inline struct vc4_transfer * vc4_transfer(struct pipe_transfer *ptrans) { return (struct vc4_transfer *)ptrans; diff --git a/src/gallium/drivers/vc4/vc4_screen.c b/src/gallium/drivers/vc4/vc4_screen.c index f63bead0fbb..2dee1d40e5f 100644 --- a/src/gallium/drivers/vc4/vc4_screen.c +++ b/src/gallium/drivers/vc4/vc4_screen.c @@ -176,6 +176,10 @@ vc4_screen_get_param(struct pipe_screen *pscreen, enum pipe_cap param) case PIPE_CAP_MULTISAMPLE_Z_RESOLVE: case PIPE_CAP_RESOURCE_FROM_USER_MEMORY: 
case PIPE_CAP_DEVICE_RESET_STATUS_QUERY: + case PIPE_CAP_MAX_SHADER_PATCH_VARYINGS: + case PIPE_CAP_TEXTURE_FLOAT_LINEAR: + case PIPE_CAP_TEXTURE_HALF_FLOAT_LINEAR: + case PIPE_CAP_DEPTH_BOUNDS_TEST: return 0; /* Stream output. */ @@ -489,6 +493,12 @@ vc4_screen_bo_get_handle(struct pipe_screen *pscreen, { whandle->stride = stride; + /* If we're passing some reference to our BO out to some other part of + * the system, then we can't do any optimizations about only us being + * the ones seeing it (like BO caching or shadow update avoidance). + */ + bo->private = false; + switch (whandle->type) { case DRM_API_HANDLE_TYPE_SHARED: return vc4_bo_flink(bo, &whandle->handle); diff --git a/src/gallium/drivers/vc4/vc4_simulator.c b/src/gallium/drivers/vc4/vc4_simulator.c index b58013dd2ee..7cfd236349d 100644 --- a/src/gallium/drivers/vc4/vc4_simulator.c +++ b/src/gallium/drivers/vc4/vc4_simulator.c @@ -74,11 +74,12 @@ vc4_simulator_pin_bos(struct drm_device *dev, struct vc4_exec_info *exec) struct vc4_bo **bos = vc4->bo_pointers.base; exec->bo_count = args->bo_handle_count; - exec->bo = calloc(exec->bo_count, sizeof(struct vc4_bo_exec_state)); + exec->bo = calloc(exec->bo_count, sizeof(void *)); for (int i = 0; i < exec->bo_count; i++) { struct vc4_bo *bo = bos[i]; struct drm_gem_cma_object *obj = vc4_wrap_bo_with_cma(dev, bo); + struct drm_vc4_bo *drm_bo = to_vc4_bo(&obj->base); #if 0 fprintf(stderr, "bo hindex %d: %s\n", i, bo->name); #endif @@ -86,7 +87,16 @@ vc4_simulator_pin_bos(struct drm_device *dev, struct vc4_exec_info *exec) vc4_bo_map(bo); memcpy(obj->vaddr, bo->map, bo->size); - exec->bo[i].bo = obj; + exec->bo[i] = obj; + + /* The kernel does this validation at shader create ioctl + * time. 
+ */ + if (strcmp(bo->name, "code") == 0) { + drm_bo->validated_shader = vc4_validate_shader(obj); + if (!drm_bo->validated_shader) + abort(); + } } return 0; } @@ -95,7 +105,7 @@ static int vc4_simulator_unpin_bos(struct vc4_exec_info *exec) { for (int i = 0; i < exec->bo_count; i++) { - struct drm_gem_cma_object *obj = exec->bo[i].bo; + struct drm_gem_cma_object *obj = exec->bo[i]; struct vc4_bo *bo = to_vc4_bo(&obj->base)->bo; memcpy(bo->map, obj->vaddr, bo->size); diff --git a/src/gallium/drivers/vc4/vc4_simulator_validate.h b/src/gallium/drivers/vc4/vc4_simulator_validate.h index 2bb36b253bb..68ace0216aa 100644 --- a/src/gallium/drivers/vc4/vc4_simulator_validate.h +++ b/src/gallium/drivers/vc4/vc4_simulator_validate.h @@ -78,6 +78,7 @@ struct drm_gem_cma_object { struct drm_vc4_bo { struct drm_gem_cma_object base; struct vc4_bo *bo; + struct vc4_validated_shader_info *validated_shader; struct list_head unref_head; }; diff --git a/src/gallium/drivers/vc4/vc4_state.c b/src/gallium/drivers/vc4/vc4_state.c index 4a1d4c3a4d6..8a759c2ca4c 100644 --- a/src/gallium/drivers/vc4/vc4_state.c +++ b/src/gallium/drivers/vc4/vc4_state.c @@ -107,7 +107,7 @@ vc4_create_rasterizer_state(struct pipe_context *pctx, /* Workaround: HW-2726 PTB does not handle zero-size points (BCM2835, * BCM21553). */ - so->point_size = MAX2(cso->point_size, .125); + so->point_size = MAX2(cso->point_size, .125f); if (cso->front_ccw) so->config_bits[0] |= VC4_CONFIG_BITS_CW_PRIMITIVES; @@ -461,11 +461,64 @@ vc4_get_stage_tex(struct vc4_context *vc4, unsigned shader) } } +static uint32_t translate_wrap(uint32_t p_wrap, bool using_nearest) +{ + switch (p_wrap) { + case PIPE_TEX_WRAP_REPEAT: + return 0; + case PIPE_TEX_WRAP_CLAMP_TO_EDGE: + return 1; + case PIPE_TEX_WRAP_MIRROR_REPEAT: + return 2; + case PIPE_TEX_WRAP_CLAMP_TO_BORDER: + return 3; + case PIPE_TEX_WRAP_CLAMP: + return (using_nearest ? 
1 : 3); + default: + fprintf(stderr, "Unknown wrap mode %d\n", p_wrap); + assert(!"not reached"); + return 0; + } +} + static void * vc4_create_sampler_state(struct pipe_context *pctx, const struct pipe_sampler_state *cso) { - return vc4_generic_cso_state_create(cso, sizeof(*cso)); + static const uint8_t minfilter_map[6] = { + VC4_TEX_P1_MINFILT_NEAR_MIP_NEAR, + VC4_TEX_P1_MINFILT_LIN_MIP_NEAR, + VC4_TEX_P1_MINFILT_NEAR_MIP_LIN, + VC4_TEX_P1_MINFILT_LIN_MIP_LIN, + VC4_TEX_P1_MINFILT_NEAREST, + VC4_TEX_P1_MINFILT_LINEAR, + }; + static const uint32_t magfilter_map[] = { + [PIPE_TEX_FILTER_NEAREST] = VC4_TEX_P1_MAGFILT_NEAREST, + [PIPE_TEX_FILTER_LINEAR] = VC4_TEX_P1_MAGFILT_LINEAR, + }; + bool either_nearest = + (cso->mag_img_filter == PIPE_TEX_MIPFILTER_NEAREST || + cso->min_img_filter == PIPE_TEX_MIPFILTER_NEAREST); + struct vc4_sampler_state *so = CALLOC_STRUCT(vc4_sampler_state); + + if (!so) + return NULL; + + memcpy(so, cso, sizeof(*cso)); + + so->texture_p1 = + (VC4_SET_FIELD(magfilter_map[cso->mag_img_filter], + VC4_TEX_P1_MAGFILT) | + VC4_SET_FIELD(minfilter_map[cso->min_mip_filter * 2 + + cso->min_img_filter], + VC4_TEX_P1_MINFILT) | + VC4_SET_FIELD(translate_wrap(cso->wrap_s, either_nearest), + VC4_TEX_P1_WRAP_S) | + VC4_SET_FIELD(translate_wrap(cso->wrap_t, either_nearest), + VC4_TEX_P1_WRAP_T)); + + return so; } static void @@ -499,13 +552,13 @@ static struct pipe_sampler_view * vc4_create_sampler_view(struct pipe_context *pctx, struct pipe_resource *prsc, const struct pipe_sampler_view *cso) { - struct pipe_sampler_view *so = malloc(sizeof(*so)); + struct vc4_sampler_view *so = malloc(sizeof(*so)); struct vc4_resource *rsc = vc4_resource(prsc); if (!so) return NULL; - *so = *cso; + so->base = *cso; pipe_reference(NULL, &prsc->reference); @@ -516,18 +569,19 @@ vc4_create_sampler_view(struct pipe_context *pctx, struct pipe_resource *prsc, * Also, Raspberry Pi doesn't support sampling from raster textures, * so we also have to copy to a temporary then. 
*/ - if (so->u.tex.first_level || + if (cso->u.tex.first_level || rsc->vc4_format == VC4_TEXTURE_TYPE_RGBA32R) { struct vc4_resource *shadow_parent = vc4_resource(prsc); struct pipe_resource tmpl = shadow_parent->base.b; struct vc4_resource *clone; tmpl.bind = PIPE_BIND_SAMPLER_VIEW | PIPE_BIND_RENDER_TARGET; - tmpl.width0 = u_minify(tmpl.width0, so->u.tex.first_level); - tmpl.height0 = u_minify(tmpl.height0, so->u.tex.first_level); - tmpl.last_level = so->u.tex.last_level - so->u.tex.first_level; + tmpl.width0 = u_minify(tmpl.width0, cso->u.tex.first_level); + tmpl.height0 = u_minify(tmpl.height0, cso->u.tex.first_level); + tmpl.last_level = cso->u.tex.last_level - cso->u.tex.first_level; prsc = vc4_resource_create(pctx->screen, &tmpl); + rsc = vc4_resource(prsc); clone = vc4_resource(prsc); clone->shadow_parent = &shadow_parent->base.b; /* Flag it as needing update of the contents from the parent. */ @@ -535,11 +589,23 @@ vc4_create_sampler_view(struct pipe_context *pctx, struct pipe_resource *prsc, assert(clone->vc4_format != VC4_TEXTURE_TYPE_RGBA32R); } - so->texture = prsc; - so->reference.count = 1; - so->context = pctx; - - return so; + so->base.texture = prsc; + so->base.reference.count = 1; + so->base.context = pctx; + + so->texture_p0 = + (VC4_SET_FIELD(rsc->slices[0].offset >> 12, VC4_TEX_P0_OFFSET) | + VC4_SET_FIELD(rsc->vc4_format & 15, VC4_TEX_P0_TYPE) | + VC4_SET_FIELD(cso->u.tex.last_level - + cso->u.tex.first_level, VC4_TEX_P0_MIPLVLS) | + VC4_SET_FIELD(cso->target == PIPE_TEXTURE_CUBE, + VC4_TEX_P0_CMMODE)); + so->texture_p1 = + (VC4_SET_FIELD(rsc->vc4_format >> 4, VC4_TEX_P1_TYPE4) | + VC4_SET_FIELD(prsc->height0 & 2047, VC4_TEX_P1_HEIGHT) | + VC4_SET_FIELD(prsc->width0 & 2047, VC4_TEX_P1_WIDTH)); + + return &so->base; } static void diff --git a/src/gallium/drivers/vc4/vc4_tiling.c b/src/gallium/drivers/vc4/vc4_tiling.c index f9801c9cefd..cf86eb0fa31 100644 --- a/src/gallium/drivers/vc4/vc4_tiling.c +++ b/src/gallium/drivers/vc4/vc4_tiling.c @@ 
-127,13 +127,10 @@ vc4_store_utile(void *dst, void *src, uint32_t src_stride, uint32_t cpp) static void check_box_utile_alignment(const struct pipe_box *box, int cpp) { - uint32_t utile_w = vc4_utile_width(cpp); - uint32_t utile_h = vc4_utile_height(cpp); - - assert(!(box->x & (utile_w - 1))); - assert(!(box->y & (utile_h - 1))); - assert(!(box->width & (utile_w - 1))); - assert(!(box->height & (utile_h - 1))); + assert(!(box->x & (vc4_utile_width(cpp) - 1))); + assert(!(box->y & (vc4_utile_height(cpp) - 1))); + assert(!(box->width & (vc4_utile_width(cpp) - 1))); + assert(!(box->height & (vc4_utile_height(cpp) - 1))); } static void diff --git a/src/gallium/drivers/vc4/vc4_tiling.h b/src/gallium/drivers/vc4/vc4_tiling.h index b5d10da3417..b90bba70200 100644 --- a/src/gallium/drivers/vc4/vc4_tiling.h +++ b/src/gallium/drivers/vc4/vc4_tiling.h @@ -24,9 +24,9 @@ #ifndef VC4_TILING_H #define VC4_TILING_H -uint32_t vc4_utile_width(int cpp); -uint32_t vc4_utile_height(int cpp); -bool vc4_size_is_lt(uint32_t width, uint32_t height, int cpp); +uint32_t vc4_utile_width(int cpp) ATTRIBUTE_CONST; +uint32_t vc4_utile_height(int cpp) ATTRIBUTE_CONST; +bool vc4_size_is_lt(uint32_t width, uint32_t height, int cpp) ATTRIBUTE_CONST; void vc4_load_utile(void *dst, void *src, uint32_t dst_stride, uint32_t cpp); void vc4_store_utile(void *dst, void *src, uint32_t src_stride, uint32_t cpp); void vc4_load_tiled_image(void *dst, uint32_t dst_stride, diff --git a/src/gallium/drivers/vc4/vc4_uniforms.c b/src/gallium/drivers/vc4/vc4_uniforms.c new file mode 100644 index 00000000000..85d6998205e --- /dev/null +++ b/src/gallium/drivers/vc4/vc4_uniforms.c @@ -0,0 +1,344 @@ +/* + * Copyright © 2014-2015 Broadcom + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, 
publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
 * IN THE SOFTWARE.
 */

/* vc4_uniforms.c: CPU-side emission of a shader's uniform stream.  At draw
 * time, vc4_write_uniforms() walks the per-shader list of
 * (contents, data) uniform slots produced at compile time and appends the
 * corresponding 32-bit values (constants, viewport transforms, texture
 * config words, relocations, ...) to the vc4->uniforms command list.
 */

#include "util/u_pack_color.h"
#include "util/format_srgb.h"

#include "vc4_context.h"
#include "vc4_qir.h"

/* Emits texture config parameter 0 for a sampling operation: a relocation
 * against the texture's BO, offset by the sampler view's precomputed P0 bits
 * (sview->texture_p0).
 */
static void
write_texture_p0(struct vc4_context *vc4,
                 struct vc4_cl_out **uniforms,
                 struct vc4_texture_stateobj *texstate,
                 uint32_t unit)
{
        struct vc4_sampler_view *sview =
                vc4_sampler_view(texstate->textures[unit]);
        struct vc4_resource *rsc = vc4_resource(sview->base.texture);

        cl_reloc(vc4, &vc4->uniforms, uniforms, rsc->bo, sview->texture_p0);
}

/* Emits texture config parameter 1: the OR of the sampler view's P1 bits
 * (format/size state) and the sampler state's P1 bits (filter/wrap state),
 * both precomputed when the CSOs were created.
 */
static void
write_texture_p1(struct vc4_context *vc4,
                 struct vc4_cl_out **uniforms,
                 struct vc4_texture_stateobj *texstate,
                 uint32_t unit)
{
        struct vc4_sampler_view *sview =
                vc4_sampler_view(texstate->textures[unit]);
        struct vc4_sampler_state *sampler =
                vc4_sampler_state(texstate->samplers[unit]);

        cl_aligned_u32(uniforms, sview->texture_p1 | sampler->texture_p1);
}

/* Emits texture config parameter 2 (cube map stride variant).
 *
 * "data" is packed by the compiler: the sampler unit index is in the low 16
 * bits and bit 16 is forwarded into the P2 BSLOD field (presumably an
 * explicit-LOD flag -- confirm against the compiler's P2 emission).  The cube
 * map stride is stored in 4096-byte units (>> 12).
 */
static void
write_texture_p2(struct vc4_context *vc4,
                 struct vc4_cl_out **uniforms,
                 struct vc4_texture_stateobj *texstate,
                 uint32_t data)
{
        uint32_t unit = data & 0xffff;
        struct pipe_sampler_view *texture = texstate->textures[unit];
        struct vc4_resource *rsc = vc4_resource(texture->texture);

        cl_aligned_u32(uniforms,
                       VC4_SET_FIELD(VC4_TEX_P2_PTYPE_CUBE_MAP_STRIDE,
                                     VC4_TEX_P2_PTYPE) |
                       VC4_SET_FIELD(rsc->cube_map_stride >> 12, VC4_TEX_P2_CMST) |
                       VC4_SET_FIELD((data >> 16) & 1, VC4_TEX_P2_BSLOD));
}


/* Convenience initializer for a util_format swizzle array.
 * NOTE(review): not referenced anywhere in this file's visible code --
 * possibly left over from an earlier revision.
 */
#define SWIZ(x,y,z,w) { \
        UTIL_FORMAT_SWIZZLE_##x, \
        UTIL_FORMAT_SWIZZLE_##y, \
        UTIL_FORMAT_SWIZZLE_##z, \
        UTIL_FORMAT_SWIZZLE_##w \
}

/* Emits the border color uniform for a sampler unit, packed into the byte
 * layout the hardware expects for the texture's vc4_format.
 *
 * The color is first converted to sRGB if the view's format is sRGB, then
 * unswizzled into storage-channel order, then packed per vc4_format so that
 * the vc4_get_format_swizzle() applied at sampling time yields the correct
 * channels.
 */
static void
write_texture_border_color(struct vc4_context *vc4,
                           struct vc4_cl_out **uniforms,
                           struct vc4_texture_stateobj *texstate,
                           uint32_t unit)
{
        struct pipe_sampler_state *sampler = texstate->samplers[unit];
        struct pipe_sampler_view *texture = texstate->textures[unit];
        struct vc4_resource *rsc = vc4_resource(texture->texture);
        union util_color uc;

        const struct util_format_description *tex_format_desc =
                util_format_description(texture->format);

        float border_color[4];
        for (int i = 0; i < 4; i++)
                border_color[i] = sampler->border_color.f[i];
        if (util_format_is_srgb(texture->format)) {
                /* Only RGB is sRGB-encoded; alpha stays linear. */
                for (int i = 0; i < 3; i++)
                        border_color[i] =
                                util_format_linear_to_srgb_float(border_color[i]);
        }

        /* Turn the border color into the layout of channels that it would
         * have when stored as texture contents.
         */
        float storage_color[4];
        util_format_unswizzle_4f(storage_color,
                                 border_color,
                                 tex_format_desc->swizzle);

        /* Now, pack so that when the vc4_format-sampled texture contents are
         * replaced with our border color, the vc4_get_format_swizzle()
         * swizzling will get the right channels.
         */
        if (util_format_is_depth_or_stencil(texture->format)) {
                /* Depth border: pack Z24 and shift into the high 24 bits. */
                uc.ui[0] = util_pack_z(PIPE_FORMAT_Z24X8_UNORM,
                                       sampler->border_color.f[0]) << 8;
        } else {
                switch (rsc->vc4_format) {
                default:
                case VC4_TEXTURE_TYPE_RGBA8888:
                        util_pack_color(storage_color,
                                        PIPE_FORMAT_R8G8B8A8_UNORM, &uc);
                        break;
                case VC4_TEXTURE_TYPE_RGBA4444:
                        util_pack_color(storage_color,
                                        PIPE_FORMAT_A8B8G8R8_UNORM, &uc);
                        break;
                case VC4_TEXTURE_TYPE_RGB565:
                        util_pack_color(storage_color,
                                        PIPE_FORMAT_B8G8R8A8_UNORM, &uc);
                        break;
                case VC4_TEXTURE_TYPE_ALPHA:
                        uc.ui[0] = float_to_ubyte(storage_color[0]) << 24;
                        break;
                case VC4_TEXTURE_TYPE_LUMALPHA:
                        uc.ui[0] = ((float_to_ubyte(storage_color[1]) << 24) |
                                    (float_to_ubyte(storage_color[0]) << 0));
                        break;
                }
        }

        cl_aligned_u32(uniforms, uc.ui[0]);
}

/* Returns the float bits (fui) of 1/width or 1/height of the texture bound
 * to unit "data" -- the scale factor used to normalize RECT-style texel
 * coordinates for the unit-coordinate hardware sampler.
 */
static uint32_t
get_texrect_scale(struct vc4_texture_stateobj *texstate,
                  enum quniform_contents contents,
                  uint32_t data)
{
        struct pipe_sampler_view *texture = texstate->textures[data];
        uint32_t dim;

        if (contents == QUNIFORM_TEXRECT_SCALE_X)
                dim = texture->texture->width0;
        else
                dim = texture->texture->height0;

        return fui(1.0f / dim);
}

/* Allocates a BO and copies into it the UBO ranges that the compiled shader
 * actually reads from the gallium constant buffer.  Returns NULL when the
 * shader uses no UBO data; otherwise the caller owns a reference and must
 * unreference it after the QUNIFORM_UBO_ADDR relocation is emitted.
 *
 * NOTE(review): dst_offset/src_offset index uint32_t pointers (dword units)
 * while memcpy's "size" is in bytes -- confirm the units match how the
 * compiler fills in ubo_ranges.
 */
static struct vc4_bo *
vc4_upload_ubo(struct vc4_context *vc4,
               struct vc4_compiled_shader *shader,
               const uint32_t *gallium_uniforms)
{
        if (!shader->ubo_size)
                return NULL;

        struct vc4_bo *ubo = vc4_bo_alloc(vc4->screen, shader->ubo_size, "ubo");
        uint32_t *data = vc4_bo_map(ubo);
        for (uint32_t i = 0; i < shader->num_ubo_ranges; i++) {
                memcpy(data + shader->ubo_ranges[i].dst_offset,
                       gallium_uniforms + shader->ubo_ranges[i].src_offset,
                       shader->ubo_ranges[i].size);
        }

        return ubo;
}

/* Appends one 32-bit value per compiled uniform slot to vc4->uniforms, in
 * the order the shader's QPU code will consume them.  Texture P0 slots are
 * emitted as relocations (counted up front via cl_start_shader_reloc so the
 * kernel validator can patch them).
 */
void
vc4_write_uniforms(struct vc4_context *vc4, struct vc4_compiled_shader *shader,
                   struct vc4_constbuf_stateobj *cb,
                   struct vc4_texture_stateobj *texstate)
{
        struct vc4_shader_uniform_info *uinfo = &shader->uniforms;
        const uint32_t *gallium_uniforms = cb->cb[0].user_buffer;
        struct vc4_bo *ubo = vc4_upload_ubo(vc4, shader, gallium_uniforms);

        /* Reserve 4 bytes per uniform, plus 4 per texture-sample relocation
         * record.
         */
        cl_ensure_space(&vc4->uniforms, (uinfo->count +
                                         uinfo->num_texture_samples) * 4);

        struct vc4_cl_out *uniforms =
                cl_start_shader_reloc(&vc4->uniforms,
                                      uinfo->num_texture_samples);

        for (int i = 0; i < uinfo->count; i++) {

                switch (uinfo->contents[i]) {
                case QUNIFORM_CONSTANT:
                        /* data[] holds the literal 32-bit value. */
                        cl_aligned_u32(&uniforms, uinfo->data[i]);
                        break;
                case QUNIFORM_UNIFORM:
                        /* data[] is a dword index into constant buffer 0. */
                        cl_aligned_u32(&uniforms,
                                       gallium_uniforms[uinfo->data[i]]);
                        break;
                case QUNIFORM_VIEWPORT_X_SCALE:
                        /* 16.0 converts NDC to the hardware's 12.4
                         * fixed-point screen coordinates -- TODO confirm.
                         */
                        cl_aligned_f(&uniforms, vc4->viewport.scale[0] * 16.0f);
                        break;
                case QUNIFORM_VIEWPORT_Y_SCALE:
                        cl_aligned_f(&uniforms, vc4->viewport.scale[1] * 16.0f);
                        break;

                case QUNIFORM_VIEWPORT_Z_OFFSET:
                        cl_aligned_f(&uniforms, vc4->viewport.translate[2]);
                        break;
                case QUNIFORM_VIEWPORT_Z_SCALE:
                        cl_aligned_f(&uniforms, vc4->viewport.scale[2]);
                        break;

                case QUNIFORM_USER_CLIP_PLANE:
                        /* data[] encodes plane * 4 + component. */
                        cl_aligned_f(&uniforms,
                                     vc4->clip.ucp[uinfo->data[i] / 4][uinfo->data[i] % 4]);
                        break;

                case QUNIFORM_TEXTURE_CONFIG_P0:
                        write_texture_p0(vc4, &uniforms, texstate,
                                         uinfo->data[i]);
                        break;

                case QUNIFORM_TEXTURE_CONFIG_P1:
                        write_texture_p1(vc4, &uniforms, texstate,
                                         uinfo->data[i]);
                        break;

                case QUNIFORM_TEXTURE_CONFIG_P2:
                        write_texture_p2(vc4, &uniforms, texstate,
                                         uinfo->data[i]);
                        break;

                case QUNIFORM_UBO_ADDR:
                        cl_aligned_reloc(vc4, &vc4->uniforms, &uniforms, ubo, 0);
                        break;

                case QUNIFORM_TEXTURE_BORDER_COLOR:
                        write_texture_border_color(vc4, &uniforms,
                                                   texstate, uinfo->data[i]);
                        break;

                case QUNIFORM_TEXRECT_SCALE_X:
                case QUNIFORM_TEXRECT_SCALE_Y:
                        cl_aligned_u32(&uniforms,
                                       get_texrect_scale(texstate,
                                                         uinfo->contents[i],
                                                         uinfo->data[i]));
                        break;

                case QUNIFORM_BLEND_CONST_COLOR_X:
                case QUNIFORM_BLEND_CONST_COLOR_Y:
                case QUNIFORM_BLEND_CONST_COLOR_Z:
                case QUNIFORM_BLEND_CONST_COLOR_W:
                        cl_aligned_f(&uniforms,
                                     CLAMP(vc4->blend_color.color[uinfo->contents[i] -
                                                                  QUNIFORM_BLEND_CONST_COLOR_X],
                                           0, 1));
                        break;

                case QUNIFORM_STENCIL:
                        /* data[] selects front (0) / back (1) / an extra
                         * config word (>1); the stencil ref value is merged
                         * into bits 8-15 for the first two.
                         */
                        cl_aligned_u32(&uniforms,
                                       vc4->zsa->stencil_uniforms[uinfo->data[i]] |
                                       (uinfo->data[i] <= 1 ?
                                        (vc4->stencil_ref.ref_value[uinfo->data[i]] << 8) :
                                        0));
                        break;

                case QUNIFORM_ALPHA_REF:
                        cl_aligned_f(&uniforms,
                                     vc4->zsa->base.alpha.ref_value);
                        break;
                }
#if 0
                uint32_t written_val = *((uint32_t *)uniforms - 1);
                fprintf(stderr, "%p: %d / 0x%08x (%f)\n",
                        shader, i, written_val, uif(written_val));
#endif
        }

        cl_end(&vc4->uniforms, uniforms);

        vc4_bo_unreference(&ubo);
}

/* Precomputes which VC4_DIRTY_* state changes require re-emitting this
 * shader's uniform stream, by mapping each uniform slot's contents back to
 * the context state it reads.  Stored on the shader so the draw path can
 * skip vc4_write_uniforms() work when nothing relevant changed.
 */
void
vc4_set_shader_uniform_dirty_flags(struct vc4_compiled_shader *shader)
{
        uint32_t dirty = 0;

        for (int i = 0; i < shader->uniforms.count; i++) {
                switch (shader->uniforms.contents[i]) {
                case QUNIFORM_CONSTANT:
                        /* Baked into the stream; depends on no state. */
                        break;
                case QUNIFORM_UNIFORM:
                case QUNIFORM_UBO_ADDR:
                        dirty |= VC4_DIRTY_CONSTBUF;
                        break;

                case QUNIFORM_VIEWPORT_X_SCALE:
                case QUNIFORM_VIEWPORT_Y_SCALE:
                case QUNIFORM_VIEWPORT_Z_OFFSET:
                case QUNIFORM_VIEWPORT_Z_SCALE:
                        dirty |= VC4_DIRTY_VIEWPORT;
                        break;

                case QUNIFORM_USER_CLIP_PLANE:
                        dirty |= VC4_DIRTY_CLIP;
                        break;

                case QUNIFORM_TEXTURE_CONFIG_P0:
                case QUNIFORM_TEXTURE_CONFIG_P1:
                case QUNIFORM_TEXTURE_CONFIG_P2:
                case QUNIFORM_TEXTURE_BORDER_COLOR:
                case QUNIFORM_TEXRECT_SCALE_X:
                case QUNIFORM_TEXRECT_SCALE_Y:
                        dirty |= VC4_DIRTY_TEXSTATE;
                        break;

                case QUNIFORM_BLEND_CONST_COLOR_X:
                case QUNIFORM_BLEND_CONST_COLOR_Y:
                case QUNIFORM_BLEND_CONST_COLOR_Z:
                case QUNIFORM_BLEND_CONST_COLOR_W:
                        dirty |= VC4_DIRTY_BLEND_COLOR;
                        break;

                case QUNIFORM_STENCIL:
                case QUNIFORM_ALPHA_REF:
                        dirty |= VC4_DIRTY_ZSA;
                        break;
                }
        }

        shader->uniform_dirty_bits = dirty;
}