diff options
-rw-r--r-- | src/mesa/pipe/cell/spu/spu_vertex_fetch.c | 152 | ||||
-rw-r--r-- | src/mesa/pipe/cell/spu/spu_vertex_shader.h | 2 |
2 files changed, 71 insertions, 83 deletions
diff --git a/src/mesa/pipe/cell/spu/spu_vertex_fetch.c b/src/mesa/pipe/cell/spu/spu_vertex_fetch.c index cbd389435e6..3bbf9b7be4f 100644 --- a/src/mesa/pipe/cell/spu/spu_vertex_fetch.c +++ b/src/mesa/pipe/cell/spu/spu_vertex_fetch.c @@ -100,7 +100,7 @@ fetch_unaligned(qword *dst, unsigned ea, unsigned size) } -#define CVT_32_FLOAT(q) (*q) +#define CVT_32_FLOAT(q) (*(q)) static INLINE qword CVT_64_FLOAT(const qword *qw) @@ -242,85 +242,90 @@ CVT_32_SNORM(const qword *qw) * This is probably needed/dupliocated elsewhere, eg format * conversion, texture sampling etc. */ -#define FETCH_ATTRIB( NAME, SZ, CVT ) \ -static qword \ -fetch_##NAME(const qword *qw) \ -{ \ - qword expanded = CVT(qw); \ - return si_selb(expanded, (qword) defaults, SZ); \ +#define FETCH_ATTRIB( NAME, SZ, CVT, N ) \ +static void \ +fetch_##NAME(qword *out, const qword *in) \ +{ \ + qword tmp[4]; \ + \ + tmp[0] = si_selb(CVT(in + (0 * N)), (qword) defaults, SZ); \ + tmp[1] = si_selb(CVT(in + (1 * N)), (qword) defaults, SZ); \ + tmp[2] = si_selb(CVT(in + (2 * N)), (qword) defaults, SZ); \ + tmp[3] = si_selb(CVT(in + (3 * N)), (qword) defaults, SZ); \ + _transpose_matrix4x4((vec_float4 *) out, (vec_float4 *) tmp); \ } -FETCH_ATTRIB( R64G64B64A64_FLOAT, SZ_4, CVT_64_FLOAT ) -FETCH_ATTRIB( R64G64B64_FLOAT, SZ_3, CVT_64_FLOAT ) -FETCH_ATTRIB( R64G64_FLOAT, SZ_2, CVT_64_FLOAT ) -FETCH_ATTRIB( R64_FLOAT, SZ_1, CVT_64_FLOAT ) +FETCH_ATTRIB( R64G64B64A64_FLOAT, SZ_4, CVT_64_FLOAT, 2 ) +FETCH_ATTRIB( R64G64B64_FLOAT, SZ_3, CVT_64_FLOAT, 2 ) +FETCH_ATTRIB( R64G64_FLOAT, SZ_2, CVT_64_FLOAT, 2 ) +FETCH_ATTRIB( R64_FLOAT, SZ_1, CVT_64_FLOAT, 2 ) -FETCH_ATTRIB( R32G32B32A32_FLOAT, SZ_4, CVT_32_FLOAT ) -FETCH_ATTRIB( R32G32B32_FLOAT, SZ_3, CVT_32_FLOAT ) -FETCH_ATTRIB( R32G32_FLOAT, SZ_2, CVT_32_FLOAT ) -FETCH_ATTRIB( R32_FLOAT, SZ_1, CVT_32_FLOAT ) +FETCH_ATTRIB( R32G32B32A32_FLOAT, SZ_4, CVT_32_FLOAT, 1 ) +FETCH_ATTRIB( R32G32B32_FLOAT, SZ_3, CVT_32_FLOAT, 1 ) +FETCH_ATTRIB( R32G32_FLOAT, SZ_2, CVT_32_FLOAT, 1 ) +FETCH_ATTRIB( R32_FLOAT, SZ_1, CVT_32_FLOAT, 1 ) -FETCH_ATTRIB( R32G32B32A32_USCALED, SZ_4, CVT_32_USCALED ) -FETCH_ATTRIB( R32G32B32_USCALED, SZ_3, CVT_32_USCALED ) -FETCH_ATTRIB( R32G32_USCALED, SZ_2, CVT_32_USCALED ) -FETCH_ATTRIB( R32_USCALED, SZ_1, CVT_32_USCALED ) +FETCH_ATTRIB( R32G32B32A32_USCALED, SZ_4, CVT_32_USCALED, 1 ) +FETCH_ATTRIB( R32G32B32_USCALED, SZ_3, CVT_32_USCALED, 1 ) +FETCH_ATTRIB( R32G32_USCALED, SZ_2, CVT_32_USCALED, 1 ) +FETCH_ATTRIB( R32_USCALED, SZ_1, CVT_32_USCALED, 1 ) -FETCH_ATTRIB( R32G32B32A32_SSCALED, SZ_4, CVT_32_SSCALED ) -FETCH_ATTRIB( R32G32B32_SSCALED, SZ_3, CVT_32_SSCALED ) -FETCH_ATTRIB( R32G32_SSCALED, SZ_2, CVT_32_SSCALED ) -FETCH_ATTRIB( R32_SSCALED, SZ_1, CVT_32_SSCALED ) +FETCH_ATTRIB( R32G32B32A32_SSCALED, SZ_4, CVT_32_SSCALED, 1 ) +FETCH_ATTRIB( R32G32B32_SSCALED, SZ_3, CVT_32_SSCALED, 1 ) +FETCH_ATTRIB( R32G32_SSCALED, SZ_2, CVT_32_SSCALED, 1 ) +FETCH_ATTRIB( R32_SSCALED, SZ_1, CVT_32_SSCALED, 1 ) -FETCH_ATTRIB( R32G32B32A32_UNORM, SZ_4, CVT_32_UNORM ) -FETCH_ATTRIB( R32G32B32_UNORM, SZ_3, CVT_32_UNORM ) -FETCH_ATTRIB( R32G32_UNORM, SZ_2, CVT_32_UNORM ) -FETCH_ATTRIB( R32_UNORM, SZ_1, CVT_32_UNORM ) +FETCH_ATTRIB( R32G32B32A32_UNORM, SZ_4, CVT_32_UNORM, 1 ) +FETCH_ATTRIB( R32G32B32_UNORM, SZ_3, CVT_32_UNORM, 1 ) +FETCH_ATTRIB( R32G32_UNORM, SZ_2, CVT_32_UNORM, 1 ) +FETCH_ATTRIB( R32_UNORM, SZ_1, CVT_32_UNORM, 1 ) -FETCH_ATTRIB( R32G32B32A32_SNORM, SZ_4, CVT_32_SNORM ) -FETCH_ATTRIB( R32G32B32_SNORM, SZ_3, CVT_32_SNORM ) -FETCH_ATTRIB( R32G32_SNORM, SZ_2, CVT_32_SNORM ) -FETCH_ATTRIB( R32_SNORM, SZ_1, CVT_32_SNORM ) +FETCH_ATTRIB( R32G32B32A32_SNORM, SZ_4, CVT_32_SNORM, 1 ) +FETCH_ATTRIB( R32G32B32_SNORM, SZ_3, CVT_32_SNORM, 1 ) +FETCH_ATTRIB( R32G32_SNORM, SZ_2, CVT_32_SNORM, 1 ) +FETCH_ATTRIB( R32_SNORM, SZ_1, CVT_32_SNORM, 1 ) -FETCH_ATTRIB( R16G16B16A16_USCALED, SZ_4, CVT_16_USCALED ) -FETCH_ATTRIB( R16G16B16_USCALED, SZ_3, CVT_16_USCALED ) -FETCH_ATTRIB( R16G16_USCALED, SZ_2, CVT_16_USCALED ) -FETCH_ATTRIB( R16_USCALED, SZ_1, CVT_16_USCALED ) +FETCH_ATTRIB( R16G16B16A16_USCALED, SZ_4, CVT_16_USCALED, 1 ) +FETCH_ATTRIB( R16G16B16_USCALED, SZ_3, CVT_16_USCALED, 1 ) +FETCH_ATTRIB( R16G16_USCALED, SZ_2, CVT_16_USCALED, 1 ) +FETCH_ATTRIB( R16_USCALED, SZ_1, CVT_16_USCALED, 1 ) -FETCH_ATTRIB( R16G16B16A16_SSCALED, SZ_4, CVT_16_SSCALED ) -FETCH_ATTRIB( R16G16B16_SSCALED, SZ_3, CVT_16_SSCALED ) -FETCH_ATTRIB( R16G16_SSCALED, SZ_2, CVT_16_SSCALED ) -FETCH_ATTRIB( R16_SSCALED, SZ_1, CVT_16_SSCALED ) +FETCH_ATTRIB( R16G16B16A16_SSCALED, SZ_4, CVT_16_SSCALED, 1 ) +FETCH_ATTRIB( R16G16B16_SSCALED, SZ_3, CVT_16_SSCALED, 1 ) +FETCH_ATTRIB( R16G16_SSCALED, SZ_2, CVT_16_SSCALED, 1 ) +FETCH_ATTRIB( R16_SSCALED, SZ_1, CVT_16_SSCALED, 1 ) -FETCH_ATTRIB( R16G16B16A16_UNORM, SZ_4, CVT_16_UNORM ) -FETCH_ATTRIB( R16G16B16_UNORM, SZ_3, CVT_16_UNORM ) -FETCH_ATTRIB( R16G16_UNORM, SZ_2, CVT_16_UNORM ) -FETCH_ATTRIB( R16_UNORM, SZ_1, CVT_16_UNORM ) +FETCH_ATTRIB( R16G16B16A16_UNORM, SZ_4, CVT_16_UNORM, 1 ) +FETCH_ATTRIB( R16G16B16_UNORM, SZ_3, CVT_16_UNORM, 1 ) +FETCH_ATTRIB( R16G16_UNORM, SZ_2, CVT_16_UNORM, 1 ) +FETCH_ATTRIB( R16_UNORM, SZ_1, CVT_16_UNORM, 1 ) -FETCH_ATTRIB( R16G16B16A16_SNORM, SZ_4, CVT_16_SNORM ) -FETCH_ATTRIB( R16G16B16_SNORM, SZ_3, CVT_16_SNORM ) -FETCH_ATTRIB( R16G16_SNORM, SZ_2, CVT_16_SNORM ) -FETCH_ATTRIB( R16_SNORM, SZ_1, CVT_16_SNORM ) +FETCH_ATTRIB( R16G16B16A16_SNORM, SZ_4, CVT_16_SNORM, 1 ) +FETCH_ATTRIB( R16G16B16_SNORM, SZ_3, CVT_16_SNORM, 1 ) +FETCH_ATTRIB( R16G16_SNORM, SZ_2, CVT_16_SNORM, 1 ) +FETCH_ATTRIB( R16_SNORM, SZ_1, CVT_16_SNORM, 1 ) -FETCH_ATTRIB( R8G8B8A8_USCALED, SZ_4, CVT_8_USCALED ) -FETCH_ATTRIB( R8G8B8_USCALED, SZ_3, CVT_8_USCALED ) -FETCH_ATTRIB( R8G8_USCALED, SZ_2, CVT_8_USCALED ) -FETCH_ATTRIB( R8_USCALED, SZ_1, CVT_8_USCALED ) +FETCH_ATTRIB( R8G8B8A8_USCALED, SZ_4, CVT_8_USCALED, 1 ) +FETCH_ATTRIB( R8G8B8_USCALED, SZ_3, CVT_8_USCALED, 1 ) +FETCH_ATTRIB( R8G8_USCALED, SZ_2, CVT_8_USCALED, 1 ) +FETCH_ATTRIB( R8_USCALED, SZ_1, CVT_8_USCALED, 1 ) -FETCH_ATTRIB( R8G8B8A8_SSCALED, SZ_4, CVT_8_SSCALED ) -FETCH_ATTRIB( R8G8B8_SSCALED, SZ_3, CVT_8_SSCALED ) -FETCH_ATTRIB( R8G8_SSCALED, SZ_2, CVT_8_SSCALED ) -FETCH_ATTRIB( R8_SSCALED, SZ_1, CVT_8_SSCALED ) +FETCH_ATTRIB( R8G8B8A8_SSCALED, SZ_4, CVT_8_SSCALED, 1 ) +FETCH_ATTRIB( R8G8B8_SSCALED, SZ_3, CVT_8_SSCALED, 1 ) +FETCH_ATTRIB( R8G8_SSCALED, SZ_2, CVT_8_SSCALED, 1 ) +FETCH_ATTRIB( R8_SSCALED, SZ_1, CVT_8_SSCALED, 1 ) -FETCH_ATTRIB( R8G8B8A8_UNORM, SZ_4, CVT_8_UNORM ) -FETCH_ATTRIB( R8G8B8_UNORM, SZ_3, CVT_8_UNORM ) -FETCH_ATTRIB( R8G8_UNORM, SZ_2, CVT_8_UNORM ) -FETCH_ATTRIB( R8_UNORM, SZ_1, CVT_8_UNORM ) +FETCH_ATTRIB( R8G8B8A8_UNORM, SZ_4, CVT_8_UNORM, 1 ) +FETCH_ATTRIB( R8G8B8_UNORM, SZ_3, CVT_8_UNORM, 1 ) +FETCH_ATTRIB( R8G8_UNORM, SZ_2, CVT_8_UNORM, 1 ) +FETCH_ATTRIB( R8_UNORM, SZ_1, CVT_8_UNORM, 1 ) -FETCH_ATTRIB( R8G8B8A8_SNORM, SZ_4, CVT_8_SNORM ) -FETCH_ATTRIB( R8G8B8_SNORM, SZ_3, CVT_8_SNORM ) -FETCH_ATTRIB( R8G8_SNORM, SZ_2, CVT_8_SNORM ) -FETCH_ATTRIB( R8_SNORM, SZ_1, CVT_8_SNORM ) +FETCH_ATTRIB( R8G8B8A8_SNORM, SZ_4, CVT_8_SNORM, 1 ) +FETCH_ATTRIB( R8G8B8_SNORM, SZ_3, CVT_8_SNORM, 1 ) +FETCH_ATTRIB( R8G8_SNORM, SZ_2, CVT_8_SNORM, 1 ) +FETCH_ATTRIB( R8_SNORM, SZ_1, CVT_8_SNORM, 1 ) -FETCH_ATTRIB( A8R8G8B8_UNORM, SZ_4, CVT_8_UNORM ) +FETCH_ATTRIB( A8R8G8B8_UNORM, SZ_4, CVT_8_UNORM, 1 ) @@ -584,7 +589,6 @@ static void generic_vertex_fetch(struct spu_vs_context *draw, unsigned idx; const unsigned bytes_per_entry = draw->vertex_fetch.size[attr]; const unsigned quads_per_entry = (bytes_per_entry + 15) / 16; - qword p[4]; qword in[2 * 4]; @@ -609,23 +613,7 @@ static void generic_vertex_fetch(struct spu_vs_context *draw, /* Convert all 4 vertices to vectors of float. */ - idx = 0; - for (i = 0; i < 4; i++) { - p[i] = (*fetch)(in + idx); - idx += quads_per_entry; - } - - - /* Transpose/swizzle into vector-friendly format. Currently - * assuming that all vertex shader inputs are float[4], but this - * isn't true -- if the vertex shader only wants tex0.xy, we - * could optimize for that. - * - * To do so fully without codegen would probably require an - * excessive number of fetch functions, but we could at least - * minimize the transpose step: - */ - _transpose_matrix4x4(&machine->Inputs[attr].xyzw[0].q, p); + (*fetch)(&machine->Inputs[attr].xyzw[0].q, in); } } diff --git a/src/mesa/pipe/cell/spu/spu_vertex_shader.h b/src/mesa/pipe/cell/spu/spu_vertex_shader.h index ea044e841da..8b37a239a47 100644 --- a/src/mesa/pipe/cell/spu/spu_vertex_shader.h +++ b/src/mesa/pipe/cell/spu/spu_vertex_shader.h @@ -6,7 +6,7 @@ struct spu_vs_context; -typedef qword (*spu_fetch_func)(const qword *qw); +typedef void (*spu_fetch_func)(qword *out, const qword *in); typedef void (*spu_full_fetch_func)( struct spu_vs_context *draw, struct spu_exec_machine *machine, const unsigned *elts, |