diff options
author | Iago Toral Quiroga <[email protected]> | 2019-10-31 10:46:58 +0100 |
---|---|---|
committer | Iago Toral Quiroga <[email protected]> | 2019-12-16 08:42:37 +0100 |
commit | 45bc61add078b11e1809cf607e9c876329004d5d (patch) | |
tree | 5dec7af5cf27d063dbe0e3ab13e41649338fd25a /src/gallium/drivers/v3d | |
parent | 74a59fdc6e8c4f9c51454a6d1a5b8998083014b5 (diff) |
v3d: move layer rendering to a separate helper
This helps with reducing nesting level after adding the loop
to handle layered rendering.
Reviewed-by: Alejandro Piñeiro <[email protected]>
Diffstat (limited to 'src/gallium/drivers/v3d')
-rw-r--r-- | src/gallium/drivers/v3d/v3dx_rcl.c | 234 |
1 files changed, 119 insertions, 115 deletions
diff --git a/src/gallium/drivers/v3d/v3dx_rcl.c b/src/gallium/drivers/v3d/v3dx_rcl.c index 792b7582dfc..0582e9abab0 100644 --- a/src/gallium/drivers/v3d/v3dx_rcl.c +++ b/src/gallium/drivers/v3d/v3dx_rcl.c @@ -503,6 +503,123 @@ v3d_emit_z_stencil_config(struct v3d_job *job, struct v3d_surface *surf, #define div_round_up(a, b) (((a) + (b) - 1) / b) +static void +emit_render_layer(struct v3d_job *job, uint32_t layer) +{ + uint32_t supertile_w = 1, supertile_h = 1; + + /* If doing multicore binning, we would need to initialize each + * core's tile list here. + */ + uint32_t tile_alloc_offset = + layer * job->draw_tiles_x * job->draw_tiles_y * 64; + cl_emit(&job->rcl, MULTICORE_RENDERING_TILE_LIST_SET_BASE, list) { + list.address = cl_address(job->tile_alloc, tile_alloc_offset); + } + + cl_emit(&job->rcl, MULTICORE_RENDERING_SUPERTILE_CFG, config) { + uint32_t frame_w_in_supertiles, frame_h_in_supertiles; + const uint32_t max_supertiles = 256; + + /* Size up our supertiles until we get under the limit. */ + for (;;) { + frame_w_in_supertiles = div_round_up(job->draw_tiles_x, + supertile_w); + frame_h_in_supertiles = div_round_up(job->draw_tiles_y, + supertile_h); + if (frame_w_in_supertiles * + frame_h_in_supertiles < max_supertiles) { + break; + } + + if (supertile_w < supertile_h) + supertile_w++; + else + supertile_h++; + } + + config.number_of_bin_tile_lists = 1; + config.total_frame_width_in_tiles = job->draw_tiles_x; + config.total_frame_height_in_tiles = job->draw_tiles_y; + + config.supertile_width_in_tiles = supertile_w; + config.supertile_height_in_tiles = supertile_h; + + config.total_frame_width_in_supertiles = frame_w_in_supertiles; + config.total_frame_height_in_supertiles = frame_h_in_supertiles; + } + + /* Start by clearing the tile buffer. */ + cl_emit(&job->rcl, TILE_COORDINATES, coords) { + coords.tile_column_number = 0; + coords.tile_row_number = 0; + } + + /* Emit an initial clear of the tile buffers. 
This is necessary + * for any buffers that should be cleared (since clearing + * normally happens at the *end* of the generic tile list), but + * it's also nice to clear everything so the first tile doesn't + * inherit any contents from some previous frame. + * + * Also, implement the GFXH-1742 workaround. There's a race in + * the HW between the RCL updating the TLB's internal type/size + * and thespawning of the QPU instances using the TLB's current + * internal type/size. To make sure the QPUs get the right + * state, we need 1 dummy store in between internal type/size + * changes on V3D 3.x, and 2 dummy stores on 4.x. + */ +#if V3D_VERSION < 40 + cl_emit(&job->rcl, STORE_TILE_BUFFER_GENERAL, store) { + store.buffer_to_store = NONE; + } +#else + for (int i = 0; i < 2; i++) { + if (i > 0) + cl_emit(&job->rcl, TILE_COORDINATES, coords); + cl_emit(&job->rcl, END_OF_LOADS, end); + cl_emit(&job->rcl, STORE_TILE_BUFFER_GENERAL, store) { + store.buffer_to_store = NONE; + } + if (i == 0) { + cl_emit(&job->rcl, CLEAR_TILE_BUFFERS, clear) { + clear.clear_z_stencil_buffer = true; + clear.clear_all_render_targets = true; + } + } + cl_emit(&job->rcl, END_OF_TILE_MARKER, end); + } +#endif + + cl_emit(&job->rcl, FLUSH_VCD_CACHE, flush); + + v3d_rcl_emit_generic_per_tile_list(job, layer); + + /* XXX perf: We should expose GL_MESA_tile_raster_order to + * improve X11 performance, but we should use Morton order + * otherwise to improve cache locality. 
+ */ + uint32_t supertile_w_in_pixels = job->tile_width * supertile_w; + uint32_t supertile_h_in_pixels = job->tile_height * supertile_h; + uint32_t min_x_supertile = job->draw_min_x / supertile_w_in_pixels; + uint32_t min_y_supertile = job->draw_min_y / supertile_h_in_pixels; + + uint32_t max_x_supertile = 0; + uint32_t max_y_supertile = 0; + if (job->draw_max_x != 0 && job->draw_max_y != 0) { + max_x_supertile = (job->draw_max_x - 1) / supertile_w_in_pixels; + max_y_supertile = (job->draw_max_y - 1) / supertile_h_in_pixels; + } + + for (int y = min_y_supertile; y <= max_y_supertile; y++) { + for (int x = min_x_supertile; x <= max_x_supertile; x++) { + cl_emit(&job->rcl, SUPERTILE_COORDINATES, coords) { + coords.column_number_in_supertiles = x; + coords.row_number_in_supertiles = y; + } + } + } +} + void v3dX(emit_rcl)(struct v3d_job *job) { @@ -700,121 +817,8 @@ v3dX(emit_rcl)(struct v3d_job *job) * of the loop. */ assert(job->num_layers > 0 || (job->load == 0 && job->store == 0)); - for (int layer = 0; layer < MAX2(1, job->num_layers); layer++) { - uint32_t supertile_w = 1, supertile_h = 1; - - /* If doing multicore binning, we would need to initialize each core's - * tile list here. - */ - uint32_t tile_alloc_offset = - layer * job->draw_tiles_x * job->draw_tiles_y * 64; - cl_emit(&job->rcl, MULTICORE_RENDERING_TILE_LIST_SET_BASE, list) { - list.address = - cl_address(job->tile_alloc, tile_alloc_offset); - } - - cl_emit(&job->rcl, MULTICORE_RENDERING_SUPERTILE_CFG, config) { - uint32_t frame_w_in_supertiles, frame_h_in_supertiles; - const uint32_t max_supertiles = 256; - - /* Size up our supertiles until we get under the limit. 
*/ - for (;;) { - frame_w_in_supertiles = div_round_up(job->draw_tiles_x, - supertile_w); - frame_h_in_supertiles = div_round_up(job->draw_tiles_y, - supertile_h); - if (frame_w_in_supertiles * frame_h_in_supertiles < - max_supertiles) { - break; - } - - if (supertile_w < supertile_h) - supertile_w++; - else - supertile_h++; - } - - config.number_of_bin_tile_lists = 1; - config.total_frame_width_in_tiles = job->draw_tiles_x; - config.total_frame_height_in_tiles = job->draw_tiles_y; - - config.supertile_width_in_tiles = supertile_w; - config.supertile_height_in_tiles = supertile_h; - - config.total_frame_width_in_supertiles = frame_w_in_supertiles; - config.total_frame_height_in_supertiles = frame_h_in_supertiles; - } - - /* Start by clearing the tile buffer. */ - cl_emit(&job->rcl, TILE_COORDINATES, coords) { - coords.tile_column_number = 0; - coords.tile_row_number = 0; - } - - /* Emit an initial clear of the tile buffers. This is necessary for - * any buffers that should be cleared (since clearing normally happens - * at the *end* of the generic tile list), but it's also nice to clear - * everything so the first tile doesn't inherit any contents from some - * previous frame. - * - * Also, implement the GFXH-1742 workaround. There's a race in the HW - * between the RCL updating the TLB's internal type/size and the - * spawning of the QPU instances using the TLB's current internal - * type/size. To make sure the QPUs get the right state,, we need 1 - * dummy store in between internal type/size changes on V3D 3.x, and 2 - * dummy stores on 4.x. 
- */ -#if V3D_VERSION < 40 - cl_emit(&job->rcl, STORE_TILE_BUFFER_GENERAL, store) { - store.buffer_to_store = NONE; - } -#else - for (int i = 0; i < 2; i++) { - if (i > 0) - cl_emit(&job->rcl, TILE_COORDINATES, coords); - cl_emit(&job->rcl, END_OF_LOADS, end); - cl_emit(&job->rcl, STORE_TILE_BUFFER_GENERAL, store) { - store.buffer_to_store = NONE; - } - if (i == 0) { - cl_emit(&job->rcl, CLEAR_TILE_BUFFERS, clear) { - clear.clear_z_stencil_buffer = true; - clear.clear_all_render_targets = true; - } - } - cl_emit(&job->rcl, END_OF_TILE_MARKER, end); - } -#endif - - cl_emit(&job->rcl, FLUSH_VCD_CACHE, flush); - - v3d_rcl_emit_generic_per_tile_list(job, layer); - - /* XXX perf: We should expose GL_MESA_tile_raster_order to improve X11 - * performance, but we should use Morton order otherwise to improve - * cache locality. - */ - uint32_t supertile_w_in_pixels = job->tile_width * supertile_w; - uint32_t supertile_h_in_pixels = job->tile_height * supertile_h; - uint32_t min_x_supertile = job->draw_min_x / supertile_w_in_pixels; - uint32_t min_y_supertile = job->draw_min_y / supertile_h_in_pixels; - - uint32_t max_x_supertile = 0; - uint32_t max_y_supertile = 0; - if (job->draw_max_x != 0 && job->draw_max_y != 0) { - max_x_supertile = (job->draw_max_x - 1) / supertile_w_in_pixels; - max_y_supertile = (job->draw_max_y - 1) / supertile_h_in_pixels; - } - - for (int y = min_y_supertile; y <= max_y_supertile; y++) { - for (int x = min_x_supertile; x <= max_x_supertile; x++) { - cl_emit(&job->rcl, SUPERTILE_COORDINATES, coords) { - coords.column_number_in_supertiles = x; - coords.row_number_in_supertiles = y; - } - } - } - } + for (int layer = 0; layer < MAX2(1, job->num_layers); layer++) + emit_render_layer(job, layer); cl_emit(&job->rcl, END_OF_RENDERING, end); } |