diff options
author | Jason Ekstrand <[email protected]> | 2016-04-07 10:52:28 -0700 |
---|---|---|
committer | Jason Ekstrand <[email protected]> | 2016-04-08 12:08:51 -0700 |
commit | f6f54a29ca9bc8c1a1a994ff4e3ee09772de78e4 (patch) | |
tree | 3046c826ac992a547ceffe1d6649b69d0e56a44f /src | |
parent | e5295b5fb442d433b22e3b6f4b5c28a0e48677cd (diff) |
i965/tiled_memcopy: Add aligned mem_copy parameters to the [de]tiling functions
Each of the [de]tiling functions has three mem_copy calls:
1) Left edge to tile boundary
2) Tile boundary to tile boundary in a loop
3) Tile boundary to right edge
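Copies 2 and 3 start at a tile edge so the pointer to tiled memory is
guaranteed to be at least 16-byte aligned. Copy 1, on the other hand,
starts at some arbitrary place in the tile so it doesn't have any such
alignment guarantees. This commit therefore gives each [de]tiling function a
second mem_copy_align16 parameter that is used for copies 2 and 3, while copy 1
keeps using the general mem_copy. Every caller still passes the same function
for both parameters, so behavior is unchanged by this commit.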
Cc: "11.1 11.2" <[email protected]>
Reviewed-by: Matt Turner <[email protected]>
Reviewed-by: Roland Scheidegger <[email protected]>
Reviewed-by: Chad Versace <[email protected]>
Diffstat (limited to 'src')
-rw-r--r-- | src/mesa/drivers/dri/i965/intel_tiled_memcpy.c | 75 |
1 file changed, 43 insertions, 32 deletions
diff --git a/src/mesa/drivers/dri/i965/intel_tiled_memcpy.c b/src/mesa/drivers/dri/i965/intel_tiled_memcpy.c index 31354582964..0ffd3ffcab5 100644 --- a/src/mesa/drivers/dri/i965/intel_tiled_memcpy.c +++ b/src/mesa/drivers/dri/i965/intel_tiled_memcpy.c @@ -172,6 +172,12 @@ typedef void (*tile_copy_fn)(uint32_t x0, uint32_t x1, uint32_t x2, uint32_t x3, * Copy texture data from linear to X tile layout. * * \copydoc tile_copy_fn + * + * The mem_copy parameters allow the user to specify an alternative mem_copy + * function that, for instance, may do RGBA -> BGRA swizzling. The first + * function must handle any memory alignment while the second function must + * only handle 16-byte alignment in whichever side (source or destination) is + * tiled. */ static inline void linear_to_xtiled(uint32_t x0, uint32_t x1, uint32_t x2, uint32_t x3, @@ -179,7 +185,8 @@ linear_to_xtiled(uint32_t x0, uint32_t x1, uint32_t x2, uint32_t x3, char *dst, const char *src, int32_t src_pitch, uint32_t swizzle_bit, - mem_copy_fn mem_copy) + mem_copy_fn mem_copy, + mem_copy_fn mem_copy_align16) { /* The copy destination offset for each range copied is the sum of * an X offset 'x0' or 'xo' and a Y offset 'yo.' 
@@ -200,10 +207,10 @@ linear_to_xtiled(uint32_t x0, uint32_t x1, uint32_t x2, uint32_t x3, mem_copy(dst + ((x0 + yo) ^ swizzle), src + x0, x1 - x0); for (xo = x1; xo < x2; xo += xtile_span) { - mem_copy(dst + ((xo + yo) ^ swizzle), src + xo, xtile_span); + mem_copy_align16(dst + ((xo + yo) ^ swizzle), src + xo, xtile_span); } - mem_copy(dst + ((xo + yo) ^ swizzle), src + x2, x3 - x2); + mem_copy_align16(dst + ((xo + yo) ^ swizzle), src + x2, x3 - x2); src += src_pitch; } @@ -220,7 +227,8 @@ linear_to_ytiled(uint32_t x0, uint32_t x1, uint32_t x2, uint32_t x3, char *dst, const char *src, int32_t src_pitch, uint32_t swizzle_bit, - mem_copy_fn mem_copy) + mem_copy_fn mem_copy, + mem_copy_fn mem_copy_align16) { /* Y tiles consist of columns that are 'ytile_span' wide (and the same height * as the tile). Thus the destination offset for (x,y) is the sum of: @@ -259,12 +267,12 @@ linear_to_ytiled(uint32_t x0, uint32_t x1, uint32_t x2, uint32_t x3, * at each step so we don't need to calculate it explicitly. */ for (x = x1; x < x2; x += ytile_span) { - mem_copy(dst + ((xo + yo) ^ swizzle), src + x, ytile_span); + mem_copy_align16(dst + ((xo + yo) ^ swizzle), src + x, ytile_span); xo += bytes_per_column; swizzle ^= swizzle_bit; } - mem_copy(dst + ((xo + yo) ^ swizzle), src + x2, x3 - x2); + mem_copy_align16(dst + ((xo + yo) ^ swizzle), src + x2, x3 - x2); src += src_pitch; } @@ -281,7 +289,8 @@ xtiled_to_linear(uint32_t x0, uint32_t x1, uint32_t x2, uint32_t x3, char *dst, const char *src, int32_t dst_pitch, uint32_t swizzle_bit, - mem_copy_fn mem_copy) + mem_copy_fn mem_copy, + mem_copy_fn mem_copy_align16) { /* The copy destination offset for each range copied is the sum of * an X offset 'x0' or 'xo' and a Y offset 'yo.' 
@@ -302,10 +311,10 @@ xtiled_to_linear(uint32_t x0, uint32_t x1, uint32_t x2, uint32_t x3, mem_copy(dst + x0, src + ((x0 + yo) ^ swizzle), x1 - x0); for (xo = x1; xo < x2; xo += xtile_span) { - mem_copy(dst + xo, src + ((xo + yo) ^ swizzle), xtile_span); + mem_copy_align16(dst + xo, src + ((xo + yo) ^ swizzle), xtile_span); } - mem_copy(dst + x2, src + ((xo + yo) ^ swizzle), x3 - x2); + mem_copy_align16(dst + x2, src + ((xo + yo) ^ swizzle), x3 - x2); dst += dst_pitch; } @@ -322,7 +331,8 @@ ytiled_to_linear(uint32_t x0, uint32_t x1, uint32_t x2, uint32_t x3, char *dst, const char *src, int32_t dst_pitch, uint32_t swizzle_bit, - mem_copy_fn mem_copy) + mem_copy_fn mem_copy, + mem_copy_fn mem_copy_align16) { /* Y tiles consist of columns that are 'ytile_span' wide (and the same height * as the tile). Thus the destination offset for (x,y) is the sum of: @@ -361,12 +371,12 @@ ytiled_to_linear(uint32_t x0, uint32_t x1, uint32_t x2, uint32_t x3, * at each step so we don't need to calculate it explicitly. 
*/ for (x = x1; x < x2; x += ytile_span) { - mem_copy(dst + x, src + ((xo + yo) ^ swizzle), ytile_span); + mem_copy_align16(dst + x, src + ((xo + yo) ^ swizzle), ytile_span); xo += bytes_per_column; swizzle ^= swizzle_bit; } - mem_copy(dst + x2, src + ((xo + yo) ^ swizzle), x3 - x2); + mem_copy_align16(dst + x2, src + ((xo + yo) ^ swizzle), x3 - x2); dst += dst_pitch; } @@ -393,26 +403,27 @@ linear_to_xtiled_faster(uint32_t x0, uint32_t x1, uint32_t x2, uint32_t x3, if (x0 == 0 && x3 == xtile_width && y0 == 0 && y1 == xtile_height) { if (mem_copy == memcpy) return linear_to_xtiled(0, 0, xtile_width, xtile_width, 0, xtile_height, - dst, src, src_pitch, swizzle_bit, memcpy); + dst, src, src_pitch, swizzle_bit, memcpy, memcpy); else if (mem_copy == rgba8_copy_aligned_dst) return linear_to_xtiled(0, 0, xtile_width, xtile_width, 0, xtile_height, dst, src, src_pitch, swizzle_bit, - rgba8_copy_aligned_dst); + rgba8_copy_aligned_dst, rgba8_copy_aligned_dst); else unreachable("not reached"); } else { if (mem_copy == memcpy) return linear_to_xtiled(x0, x1, x2, x3, y0, y1, - dst, src, src_pitch, swizzle_bit, memcpy); + dst, src, src_pitch, swizzle_bit, + memcpy, memcpy); else if (mem_copy == rgba8_copy_aligned_dst) return linear_to_xtiled(x0, x1, x2, x3, y0, y1, dst, src, src_pitch, swizzle_bit, - rgba8_copy_aligned_dst); + rgba8_copy_aligned_dst, rgba8_copy_aligned_dst); else unreachable("not reached"); } linear_to_xtiled(x0, x1, x2, x3, y0, y1, - dst, src, src_pitch, swizzle_bit, mem_copy); + dst, src, src_pitch, swizzle_bit, mem_copy, mem_copy); } /** @@ -435,26 +446,26 @@ linear_to_ytiled_faster(uint32_t x0, uint32_t x1, uint32_t x2, uint32_t x3, if (x0 == 0 && x3 == ytile_width && y0 == 0 && y1 == ytile_height) { if (mem_copy == memcpy) return linear_to_ytiled(0, 0, ytile_width, ytile_width, 0, ytile_height, - dst, src, src_pitch, swizzle_bit, memcpy); + dst, src, src_pitch, swizzle_bit, memcpy, memcpy); else if (mem_copy == rgba8_copy_aligned_dst) return 
linear_to_ytiled(0, 0, ytile_width, ytile_width, 0, ytile_height, dst, src, src_pitch, swizzle_bit, - rgba8_copy_aligned_dst); + rgba8_copy_aligned_dst, rgba8_copy_aligned_dst); else unreachable("not reached"); } else { if (mem_copy == memcpy) return linear_to_ytiled(x0, x1, x2, x3, y0, y1, - dst, src, src_pitch, swizzle_bit, memcpy); + dst, src, src_pitch, swizzle_bit, memcpy, memcpy); else if (mem_copy == rgba8_copy_aligned_dst) return linear_to_ytiled(x0, x1, x2, x3, y0, y1, dst, src, src_pitch, swizzle_bit, - rgba8_copy_aligned_dst); + rgba8_copy_aligned_dst, rgba8_copy_aligned_dst); else unreachable("not reached"); } linear_to_ytiled(x0, x1, x2, x3, y0, y1, - dst, src, src_pitch, swizzle_bit, mem_copy); + dst, src, src_pitch, swizzle_bit, mem_copy, mem_copy); } /** @@ -477,26 +488,26 @@ xtiled_to_linear_faster(uint32_t x0, uint32_t x1, uint32_t x2, uint32_t x3, if (x0 == 0 && x3 == xtile_width && y0 == 0 && y1 == xtile_height) { if (mem_copy == memcpy) return xtiled_to_linear(0, 0, xtile_width, xtile_width, 0, xtile_height, - dst, src, dst_pitch, swizzle_bit, memcpy); + dst, src, dst_pitch, swizzle_bit, memcpy, memcpy); else if (mem_copy == rgba8_copy_aligned_src) return xtiled_to_linear(0, 0, xtile_width, xtile_width, 0, xtile_height, dst, src, dst_pitch, swizzle_bit, - rgba8_copy_aligned_src); + rgba8_copy_aligned_src, rgba8_copy_aligned_src); else unreachable("not reached"); } else { if (mem_copy == memcpy) return xtiled_to_linear(x0, x1, x2, x3, y0, y1, - dst, src, dst_pitch, swizzle_bit, memcpy); + dst, src, dst_pitch, swizzle_bit, memcpy, memcpy); else if (mem_copy == rgba8_copy_aligned_src) return xtiled_to_linear(x0, x1, x2, x3, y0, y1, dst, src, dst_pitch, swizzle_bit, - rgba8_copy_aligned_src); + rgba8_copy_aligned_src, rgba8_copy_aligned_src); else unreachable("not reached"); } xtiled_to_linear(x0, x1, x2, x3, y0, y1, - dst, src, dst_pitch, swizzle_bit, mem_copy); + dst, src, dst_pitch, swizzle_bit, mem_copy, mem_copy); } /** @@ -519,26 +530,26 @@ 
ytiled_to_linear_faster(uint32_t x0, uint32_t x1, uint32_t x2, uint32_t x3, if (x0 == 0 && x3 == ytile_width && y0 == 0 && y1 == ytile_height) { if (mem_copy == memcpy) return ytiled_to_linear(0, 0, ytile_width, ytile_width, 0, ytile_height, - dst, src, dst_pitch, swizzle_bit, memcpy); + dst, src, dst_pitch, swizzle_bit, memcpy, memcpy); else if (mem_copy == rgba8_copy_aligned_src) return ytiled_to_linear(0, 0, ytile_width, ytile_width, 0, ytile_height, dst, src, dst_pitch, swizzle_bit, - rgba8_copy_aligned_src); + rgba8_copy_aligned_src, rgba8_copy_aligned_src); else unreachable("not reached"); } else { if (mem_copy == memcpy) return ytiled_to_linear(x0, x1, x2, x3, y0, y1, - dst, src, dst_pitch, swizzle_bit, memcpy); + dst, src, dst_pitch, swizzle_bit, memcpy, memcpy); else if (mem_copy == rgba8_copy_aligned_src) return ytiled_to_linear(x0, x1, x2, x3, y0, y1, dst, src, dst_pitch, swizzle_bit, - rgba8_copy_aligned_src); + rgba8_copy_aligned_src, rgba8_copy_aligned_src); else unreachable("not reached"); } ytiled_to_linear(x0, x1, x2, x3, y0, y1, - dst, src, dst_pitch, swizzle_bit, mem_copy); + dst, src, dst_pitch, swizzle_bit, mem_copy, mem_copy); } /** |