diff options
author | Chad Versace <[email protected]> | 2012-11-19 15:15:32 -0800 |
---|---|---|
committer | Chad Versace <[email protected]> | 2013-01-24 21:24:10 -0800 |
commit | b9f56ea923f97b0fef31c9430897e440ae20d03a (patch) | |
tree | 05af30bdcc780e0f2daedd45e4229fa8f3981a59 /src/glsl/lower_packing_builtins.cpp | |
parent | 9d7931ddf06bd41f1b1a589503cd028ff00d134e (diff) |
glsl: Add lowering pass for GLSL ES 3.00 pack/unpack operations (v4)
Lower them to arithmetic and bit manipulation expressions.
v2: Rewrite using ir_builder [for idr].
v3: Comment typos. [for mattst88]
v4: Fix arithmetic error in comments.
Factor out a shift instruction.
Don't heap allocate factory.instructions.
[for paul]
Reviewed-by: Ian Romanick <[email protected]> (v2)
Reviewed-by: Matt Tuner <[email protected]> (v3)
Reviewed-by: Paul Berry <[email protected]> (v4)
Signed-off-by: Chad Versace <[email protected]>
Diffstat (limited to 'src/glsl/lower_packing_builtins.cpp')
-rw-r--r-- | src/glsl/lower_packing_builtins.cpp | 1035 |
1 files changed, 1035 insertions, 0 deletions
diff --git a/src/glsl/lower_packing_builtins.cpp b/src/glsl/lower_packing_builtins.cpp new file mode 100644 index 00000000000..136d4cdfb32 --- /dev/null +++ b/src/glsl/lower_packing_builtins.cpp @@ -0,0 +1,1035 @@ +/* + * Copyright © 2012 Intel Corporation + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER + * DEALINGS IN THE SOFTWARE. + */ + +#include "ir.h" +#include "ir_builder.h" +#include "ir_optimization.h" +#include "ir_rvalue_visitor.h" + +namespace { + +using namespace ir_builder; + +/** + * A visitor that lowers built-in floating-point pack/unpack expressions + * such packSnorm2x16. + */ +class lower_packing_builtins_visitor : public ir_rvalue_visitor { +public: + /** + * \param op_mask is a bitmask of `enum lower_packing_builtins_op` + */ + explicit lower_packing_builtins_visitor(int op_mask) + : op_mask(op_mask), + progress(false) + { + /* Mutually exclusive options. */ + assert(!((op_mask & LOWER_PACK_HALF_2x16) && + (op_mask & LOWER_PACK_HALF_2x16_TO_SPLIT))); + + assert(!((op_mask & LOWER_UNPACK_HALF_2x16) && + (op_mask & LOWER_UNPACK_HALF_2x16_TO_SPLIT))); + + factory.instructions = &factory_instructions; + } + + virtual ~lower_packing_builtins_visitor() + { + assert(factory_instructions.is_empty()); + } + + bool get_progress() { return progress; } + + void handle_rvalue(ir_rvalue **rvalue) + { + if (!*rvalue) + return; + + ir_expression *expr = (*rvalue)->as_expression(); + if (!expr) + return; + + enum lower_packing_builtins_op lowering_op = + choose_lowering_op(expr->operation); + + if (lowering_op == LOWER_PACK_UNPACK_NONE) + return; + + setup_factory(ralloc_parent(expr)); + + ir_rvalue *op0 = expr->operands[0]; + ralloc_steal(factory.mem_ctx, op0); + + switch (lowering_op) { + case LOWER_PACK_SNORM_2x16: + *rvalue = lower_pack_snorm_2x16(op0); + break; + case LOWER_PACK_UNORM_2x16: + *rvalue = lower_pack_unorm_2x16(op0); + break; + case LOWER_PACK_HALF_2x16: + *rvalue = lower_pack_half_2x16(op0); + break; + case LOWER_PACK_HALF_2x16_TO_SPLIT: + *rvalue = split_pack_half_2x16(op0); + break; + case LOWER_UNPACK_SNORM_2x16: + *rvalue = lower_unpack_snorm_2x16(op0); + break; + case LOWER_UNPACK_UNORM_2x16: + *rvalue = lower_unpack_unorm_2x16(op0); + break; + case LOWER_UNPACK_HALF_2x16: + *rvalue = lower_unpack_half_2x16(op0); + break; + case LOWER_UNPACK_HALF_2x16_TO_SPLIT: + *rvalue = split_unpack_half_2x16(op0); + break; + case LOWER_PACK_UNPACK_NONE: + assert(!"not reached"); + break; + } + + teardown_factory(); + progress = true; + } + +private: + const int op_mask; + bool progress; + ir_factory factory; + exec_list factory_instructions; + + /** + * Determine the needed lowering operation by filtering \a expr_op + * through \ref op_mask. + */ + enum lower_packing_builtins_op + choose_lowering_op(ir_expression_operation expr_op) + { + /* C++ regards int and enum as fundamentally different types. + * So, we can't simply return from each case; we must cast the return + * value. + */ + int result; + + switch (expr_op) { + case ir_unop_pack_snorm_2x16: + result = op_mask & LOWER_PACK_SNORM_2x16; + break; + case ir_unop_pack_unorm_2x16: + result = op_mask & LOWER_PACK_UNORM_2x16; + break; + case ir_unop_pack_half_2x16: + result = op_mask & (LOWER_PACK_HALF_2x16 | LOWER_PACK_HALF_2x16_TO_SPLIT); + break; + case ir_unop_unpack_snorm_2x16: + result = op_mask & LOWER_UNPACK_SNORM_2x16; + break; + case ir_unop_unpack_unorm_2x16: + result = op_mask & LOWER_UNPACK_UNORM_2x16; + break; + case ir_unop_unpack_half_2x16: + result = op_mask & (LOWER_UNPACK_HALF_2x16 | LOWER_UNPACK_HALF_2x16_TO_SPLIT); + break; + default: + result = LOWER_PACK_UNPACK_NONE; + break; + } + + return static_cast<enum lower_packing_builtins_op>(result); + } + + void + setup_factory(void *mem_ctx) + { + assert(factory.mem_ctx == NULL); + assert(factory.instructions->is_empty()); + + factory.mem_ctx = mem_ctx; + } + + void + teardown_factory() + { + base_ir->insert_before(factory.instructions); + assert(factory.instructions->is_empty()); + factory.mem_ctx = NULL; + } + + template <typename T> + ir_constant* + constant(T x) + { + return factory.constant(x); + } + + /** + * \brief Pack two uint16's into a single uint32. + * + * Interpret the given uvec2 as a uint16 pair. Pack the pair into a uint32 + * where the least significant bits specify the first element of the pair. + * Return the uint32. + */ + ir_rvalue* + pack_uvec2_to_uint(ir_rvalue *uvec2_rval) + { + assert(uvec2_rval->type == glsl_type::uvec2_type); + + /* uvec2 u = UVEC2_RVAL; */ + ir_variable *u = factory.make_temp(glsl_type::uvec2_type, + "tmp_pack_uvec2_to_uint"); + factory.emit(assign(u, uvec2_rval)); + + /* return (u.y << 16) | (u.x & 0xffff); */ + return bit_or(lshift(swizzle_y(u), constant(16u)), + bit_and(swizzle_x(u), constant(0xffffu))); + } + + /** + * \brief Unpack a uint32 into two uint16's. + * + * Interpret the given uint32 as a uint16 pair where the uint32's least + * significant bits specify the pair's first element. Return the uint16 + * pair as a uvec2. + */ + ir_rvalue* + unpack_uint_to_uvec2(ir_rvalue *uint_rval) + { + assert(uint_rval->type == glsl_type::uint_type); + + /* uint u = UINT_RVAL; */ + ir_variable *u = factory.make_temp(glsl_type::uint_type, + "tmp_unpack_uint_to_uvec2_u"); + factory.emit(assign(u, uint_rval)); + + /* uvec2 u2; */ + ir_variable *u2 = factory.make_temp(glsl_type::uvec2_type, + "tmp_unpack_uint_to_uvec2_u2"); + + /* u2.x = u & 0xffffu; */ + factory.emit(assign(u2, bit_and(u, constant(0xffffu)), WRITEMASK_X)); + + /* u2.y = u >> 16u; */ + factory.emit(assign(u2, rshift(u, constant(16u)), WRITEMASK_Y)); + + return deref(u2).val; + } + + /** + * \brief Lower a packSnorm2x16 expression. + * + * \param vec2_rval is packSnorm2x16's input + * \return packSnorm2x16's output as a uint rvalue + */ + ir_rvalue* + lower_pack_snorm_2x16(ir_rvalue *vec2_rval) + { + /* From page 88 (94 of pdf) of the GLSL ES 3.00 spec: + * + * highp uint packSnorm2x16(vec2 v) + * -------------------------------- + * First, converts each component of the normalized floating-point value + * v into 16-bit integer values. Then, the results are packed into the + * returned 32-bit unsigned integer. + * + * The conversion for component c of v to fixed point is done as + * follows: + * + * packSnorm2x16: round(clamp(c, -1, +1) * 32767.0) + * + * The first component of the vector will be written to the least + * significant bits of the output; the last component will be written to + * the most significant bits. + * + * This function generates IR that approximates the following pseudo-GLSL: + * + * return pack_uvec2_to_uint( + * uvec2(ivec2( + * round(clamp(VEC2_RVALUE, -1.0f, 1.0f) * 32767.0f)))); + * + * It is necessary to first convert the vec2 to ivec2 rather than directly + * converting vec2 to uvec2 because the latter conversion is undefined. + * From page 56 (62 of pdf) of the GLSL ES 3.00 spec: "It is undefined to + * convert a negative floating point value to an uint". + */ + assert(vec2_rval->type == glsl_type::vec2_type); + + ir_rvalue *result = pack_uvec2_to_uint( + i2u(f2i(round_even(mul(clamp(vec2_rval, + constant(-1.0f), + constant(1.0f)), + constant(32767.0f)))))); + + assert(result->type == glsl_type::uint_type); + return result; + } + + /** + * \brief Lower an unpackSnorm2x16 expression. + * + * \param uint_rval is unpackSnorm2x16's input + * \return unpackSnorm2x16's output as a vec2 rvalue + */ + ir_rvalue* + lower_unpack_snorm_2x16(ir_rvalue *uint_rval) + { + /* From page 88 (94 of pdf) of the GLSL ES 3.00 spec: + * + * highp vec2 unpackSnorm2x16 (highp uint p) + * ----------------------------------------- + * First, unpacks a single 32-bit unsigned integer p into a pair of + * 16-bit unsigned integers. Then, each component is converted to + * a normalized floating-point value to generate the returned + * two-component vector. + * + * The conversion for unpacked fixed-point value f to floating point is + * done as follows: + * + * unpackSnorm2x16: clamp(f / 32767.0, -1,+1) + * + * The first component of the returned vector will be extracted from the + * least significant bits of the input; the last component will be + * extracted from the most significant bits. + * + * This function generates IR that approximates the following pseudo-GLSL: + * + * return clamp( + * ((ivec2(unpack_uint_to_uvec2(UINT_RVALUE)) << 16) >> 16) / 32767.0f, + * -1.0f, 1.0f); + * + * The above IR may appear unnecessarily complex, but the intermediate + * conversion to ivec2 and the bit shifts are necessary to correctly unpack + * negative floats. + * + * To see why, consider packing and then unpacking vec2(-1.0, 0.0). + * packSnorm2x16 encodes -1.0 as the int16 0xffff. During unpacking, we + * place that int16 into an int32, which results in the *positive* integer + * 0x0000ffff. The int16's sign bit becomes, in the int32, the rather + * unimportant bit 16. We must now extend the int16's sign bit into bits + * 17-32, which is accomplished by left-shifting then right-shifting. + */ + + assert(uint_rval->type == glsl_type::uint_type); + + ir_rvalue *result = + clamp(div(i2f(rshift(lshift(u2i(unpack_uint_to_uvec2(uint_rval)), + constant(16)), + constant(16u))), + constant(32767.0f)), + constant(-1.0f), + constant(1.0f)); + + assert(result->type == glsl_type::vec2_type); + return result; + } + + /** + * \brief Lower a packUnorm2x16 expression. + * + * \param vec2_rval is packUnorm2x16's input + * \return packUnorm2x16's output as a uint rvalue + */ + ir_rvalue* + lower_pack_unorm_2x16(ir_rvalue *vec2_rval) + { + /* From page 88 (94 of pdf) of the GLSL ES 3.00 spec: + * + * highp uint packUnorm2x16 (vec2 v) + * --------------------------------- + * First, converts each component of the normalized floating-point value + * v into 16-bit integer values. Then, the results are packed into the + * returned 32-bit unsigned integer. + * + * The conversion for component c of v to fixed point is done as + * follows: + * + * packUnorm2x16: round(clamp(c, 0, +1) * 65535.0) + * + * The first component of the vector will be written to the least + * significant bits of the output; the last component will be written to + * the most significant bits. + * + * This function generates IR that approximates the following pseudo-GLSL: + * + * return pack_uvec2_to_uint(uvec2( + * round(clamp(VEC2_RVALUE, 0.0f, 1.0f) * 65535.0f))); + * + * Here it is safe to directly convert the vec2 to uvec2 because the the + * vec2 has been clamped to a non-negative range. + */ + + assert(vec2_rval->type == glsl_type::vec2_type); + + ir_rvalue *result = pack_uvec2_to_uint( + f2u(round_even(mul(saturate(vec2_rval), constant(65535.0f))))); + + assert(result->type == glsl_type::uint_type); + return result; + } + + /** + * \brief Lower an unpackUnorm2x16 expression. + * + * \param uint_rval is unpackUnorm2x16's input + * \return unpackUnorm2x16's output as a vec2 rvalue + */ + ir_rvalue* + lower_unpack_unorm_2x16(ir_rvalue *uint_rval) + { + /* From page 89 (95 of pdf) of the GLSL ES 3.00 spec: + * + * highp vec2 unpackUnorm2x16 (highp uint p) + * ----------------------------------------- + * First, unpacks a single 32-bit unsigned integer p into a pair of + * 16-bit unsigned integers. Then, each component is converted to + * a normalized floating-point value to generate the returned + * two-component vector. + * + * The conversion for unpacked fixed-point value f to floating point is + * done as follows: + * + * unpackUnorm2x16: f / 65535.0 + * + * The first component of the returned vector will be extracted from the + * least significant bits of the input; the last component will be + * extracted from the most significant bits. + * + * This function generates IR that approximates the following pseudo-GLSL: + * + * return vec2(unpack_uint_to_uvec2(UINT_RVALUE)) / 65535.0; + */ + + assert(uint_rval->type == glsl_type::uint_type); + + ir_rvalue *result = div(u2f(unpack_uint_to_uvec2(uint_rval)), + constant(65535.0f)); + + assert(result->type == glsl_type::vec2_type); + return result; + } + + /** + * \brief Lower the component-wise calculation of packHalf2x16. + * + * \param f_rval is one component of packHafl2x16's input + * \param e_rval is the unshifted exponent bits of f_rval + * \param m_rval is the unshifted mantissa bits of f_rval + * + * \return a uint rvalue that encodes a float16 in its lower 16 bits + */ + ir_rvalue* + pack_half_1x16_nosign(ir_rvalue *f_rval, + ir_rvalue *e_rval, + ir_rvalue *m_rval) + { + assert(e_rval->type == glsl_type::uint_type); + assert(m_rval->type == glsl_type::uint_type); + + /* uint u16; */ + ir_variable *u16 = factory.make_temp(glsl_type::uint_type, + "tmp_pack_half_1x16_u16"); + + /* float f = FLOAT_RVAL; */ + ir_variable *f = factory.make_temp(glsl_type::float_type, + "tmp_pack_half_1x16_f"); + factory.emit(assign(f, f_rval)); + + /* uint e = E_RVAL; */ + ir_variable *e = factory.make_temp(glsl_type::uint_type, + "tmp_pack_half_1x16_e"); + factory.emit(assign(e, e_rval)); + + /* uint m = M_RVAL; */ + ir_variable *m = factory.make_temp(glsl_type::uint_type, + "tmp_pack_half_1x16_m"); + factory.emit(assign(m, m_rval)); + + /* Preliminaries + * ------------- + * + * For a float16, the bit layout is: + * + * sign: 15 + * exponent: 10:14 + * mantissa: 0:9 + * + * Let f16 be a float16 value. The sign, exponent, and mantissa + * determine its value thus: + * + * if e16 = 0 and m16 = 0, then zero: (-1)^s16 * 0 (1) + * if e16 = 0 and m16!= 0, then subnormal: (-1)^s16 * 2^(e16 - 14) * (m16 / 2^10) (2) + * if 0 < e16 < 31, then normal: (-1)^s16 * 2^(e16 - 15) * (1 + m16 / 2^10) (3) + * if e16 = 31 and m16 = 0, then infinite: (-1)^s16 * inf (4) + * if e16 = 31 and m16 != 0, then NaN (5) + * + * where 0 <= m16 < 2^10. + * + * For a float32, the bit layout is: + * + * sign: 31 + * exponent: 23:30 + * mantissa: 0:22 + * + * Let f32 be a float32 value. The sign, exponent, and mantissa + * determine its value thus: + * + * if e32 = 0 and m32 = 0, then zero: (-1)^s * 0 (10) + * if e32 = 0 and m32 != 0, then subnormal: (-1)^s * 2^(e32 - 126) * (m32 / 2^23) (11) + * if 0 < e32 < 255, then normal: (-1)^s * 2^(e32 - 127) * (1 + m32 / 2^23) (12) + * if e32 = 255 and m32 = 0, then infinite: (-1)^s * inf (13) + * if e32 = 255 and m32 != 0, then NaN (14) + * + * where 0 <= m32 < 2^23. + * + * The minimum and maximum normal float16 values are + * + * min_norm16 = 2^(1 - 15) * (1 + 0 / 2^10) = 2^(-14) (20) + * max_norm16 = 2^(30 - 15) * (1 + 1023 / 2^10) (21) + * + * The step at max_norm16 is + * + * max_step16 = 2^5 (22) + * + * Observe that the float16 boundary values in equations 20-21 lie in the + * range of normal float32 values. + * + * + * Rounding Behavior + * ----------------- + * Not all float32 values can be exactly represented as a float16. We + * round all such intermediate float32 values to the nearest float16; if + * the float32 is exactly between to float16 values, we round to the one + * with an even mantissa. This rounding behavior has several benefits: + * + * - It has no sign bias. + * + * - It reproduces the behavior of real hardware: opcode F32TO16 in Intel's + * GPU ISA. + * + * - By reproducing the behavior of the GPU (at least on Intel hardware), + * compile-time evaluation of constant packHalf2x16 GLSL expressions will + * result in the same value as if the expression were executed on the + * GPU. + * + * Calculation + * ----------- + * Our task is to compute s16, e16, m16 given f32. Since this function + * ignores the sign bit, assume that s32 = s16 = 0. There are several + * cases consider. + */ + + factory.emit( + + /* Case 1) f32 is NaN + * + * The resultant f16 will also be NaN. + */ + + /* if (e32 == 255 && m32 != 0) { */ + if_tree(logic_and(equal(e, constant(0xffu << 23u)), + logic_not(equal(m, constant(0u)))), + + assign(u16, constant(0x7fffu)), + + /* Case 2) f32 lies in the range [0, min_norm16). + * + * The resultant float16 will be either zero, subnormal, or normal. + * + * Solving + * + * f32 = min_norm16 (30) + * + * gives + * + * e32 = 113 and m32 = 0 (31) + * + * Therefore this case occurs if and only if + * + * e32 < 113 (32) + */ + + /* } else if (e32 < 113) { */ + if_tree(less(e, constant(113u << 23u)), + + /* u16 = uint(round_to_even(abs(f32) * float(1u << 24u))); */ + assign(u16, f2u(round_even(mul(expr(ir_unop_abs, f), + constant((float) (1 << 24)))))), + + /* Case 3) f32 lies in the range + * [min_norm16, max_norm16 + max_step16). + * + * The resultant float16 will be either normal or infinite. + * + * Solving + * + * f32 = max_norm16 + max_step16 (40) + * = 2^15 * (1 + 1023 / 2^10) + 2^5 (41) + * = 2^16 (42) + * gives + * + * e32 = 143 and m32 = 0 (43) + * + * We already solved the boundary condition f32 = min_norm16 above + * in equation 31. Therefore this case occurs if and only if + * + * 113 <= e32 and e32 < 143 + */ + + /* } else if (e32 < 143) { */ + if_tree(less(e, constant(143u << 23u)), + + /* The addition below handles the case where the mantissa rounds + * up to 1024 and bumps the exponent. + * + * u16 = ((e - (112u << 23u)) >> 13u) + * + round_to_even((float(m) / (1u << 13u)); + */ + assign(u16, add(rshift(sub(e, constant(112u << 23u)), + constant(13u)), + f2u(round_even( + div(u2f(m), constant((float) (1 << 13))))))), + + /* Case 4) f32 lies in the range [max_norm16 + max_step16, inf]. + * + * The resultant float16 will be infinite. + * + * The cases above caught all float32 values in the range + * [0, max_norm16 + max_step16), so this is the fall-through case. + */ + + /* } else { */ + + assign(u16, constant(31u << 10u)))))); + + /* } */ + + return deref(u16).val; + } + + /** + * \brief Lower a packHalf2x16 expression. + * + * \param vec2_rval is packHalf2x16's input + * \return packHalf2x16's output as a uint rvalue + */ + ir_rvalue* + lower_pack_half_2x16(ir_rvalue *vec2_rval) + { + /* From page 89 (95 of pdf) of the GLSL ES 3.00 spec: + * + * highp uint packHalf2x16 (mediump vec2 v) + * ---------------------------------------- + * Returns an unsigned integer obtained by converting the components of + * a two-component floating-point vector to the 16-bit floating-point + * representation found in the OpenGL ES Specification, and then packing + * these two 16-bit integers into a 32-bit unsigned integer. + * + * The first vector component specifies the 16 least- significant bits + * of the result; the second component specifies the 16 most-significant + * bits. + */ + + assert(vec2_rval->type == glsl_type::vec2_type); + + /* vec2 f = VEC2_RVAL; */ + ir_variable *f = factory.make_temp(glsl_type::vec2_type, + "tmp_pack_half_2x16_f"); + factory.emit(assign(f, vec2_rval)); + + /* uvec2 f32 = bitcast_f2u(f); */ + ir_variable *f32 = factory.make_temp(glsl_type::uvec2_type, + "tmp_pack_half_2x16_f32"); + factory.emit(assign(f32, expr(ir_unop_bitcast_f2u, f))); + + /* uvec2 f16; */ + ir_variable *f16 = factory.make_temp(glsl_type::uvec2_type, + "tmp_pack_half_2x16_f16"); + + /* Get f32's unshifted exponent bits. + * + * uvec2 e = f32 & 0x7f800000u; + */ + ir_variable *e = factory.make_temp(glsl_type::uvec2_type, + "tmp_pack_half_2x16_e"); + factory.emit(assign(e, bit_and(f32, constant(0x7f800000u)))); + + /* Get f32's unshifted mantissa bits. + * + * uvec2 m = f32 & 0x007fffffu; + */ + ir_variable *m = factory.make_temp(glsl_type::uvec2_type, + "tmp_pack_half_2x16_m"); + factory.emit(assign(m, bit_and(f32, constant(0x007fffffu)))); + + /* Set f16's exponent and mantissa bits. + * + * f16.x = pack_half_1x16_nosign(e.x, m.x); + * f16.y = pack_half_1y16_nosign(e.y, m.y); + */ + factory.emit(assign(f16, pack_half_1x16_nosign(swizzle_x(f), + swizzle_x(e), + swizzle_x(m)), + WRITEMASK_X)); + factory.emit(assign(f16, pack_half_1x16_nosign(swizzle_y(f), + swizzle_y(e), + swizzle_y(m)), + WRITEMASK_Y)); + + /* Set f16's sign bits. + * + * f16 |= (f32 & (1u << 31u) >> 16u; + */ + factory.emit( + assign(f16, bit_or(f16, + rshift(bit_and(f32, constant(1u << 31u)), + constant(16u))))); + + + /* return (f16.y << 16u) | f16.x; */ + ir_rvalue *result = bit_or(lshift(swizzle_y(f16), + constant(16u)), + swizzle_x(f16)); + + assert(result->type == glsl_type::uint_type); + return result; + } + + /** + * \brief Split packHalf2x16's vec2 operand into two floats. + * + * \param vec2_rval is packHalf2x16's input + * \return a uint rvalue + * + * Some code generators, such as the i965 fragment shader, require that all + * vector expressions be lowered to a sequence of scalar expressions. + * However, packHalf2x16 cannot be scalarized by the same mechanism as + * a true vector operation because its input and output have a differing + * number of vector components. + * + * This method scalarizes packHalf2x16 by transforming it from an unary + * operation having vector input to a binary operation having scalar input. + * That is, it transforms + * + * packHalf2x16(VEC2_RVAL); + * + * into + * + * vec2 v = VEC2_RVAL; + * return packHalf2x16_split(v.x, v.y); + */ + ir_rvalue* + split_pack_half_2x16(ir_rvalue *vec2_rval) + { + assert(vec2_rval->type == glsl_type::vec2_type); + + ir_variable *v = factory.make_temp(glsl_type::vec2_type, + "tmp_split_pack_half_2x16_v"); + factory.emit(assign(v, vec2_rval)); + + return expr(ir_binop_pack_half_2x16_split, swizzle_x(v), swizzle_y(v)); + } + + /** + * \brief Lower the component-wise calculation of unpackHalf2x16. + * + * Given a uint that encodes a float16 in its lower 16 bits, this function + * returns a uint that encodes a float32 with the same value. The sign bit + * of the float16 is ignored. + * + * \param e_rval is the unshifted exponent bits of a float16 + * \param m_rval is the unshifted mantissa bits of a float16 + * \param a uint rvalue that encodes a float32 + */ + ir_rvalue* + unpack_half_1x16_nosign(ir_rvalue *e_rval, ir_rvalue *m_rval) + { + assert(e_rval->type == glsl_type::uint_type); + assert(m_rval->type == glsl_type::uint_type); + + /* uint u32; */ + ir_variable *u32 = factory.make_temp(glsl_type::uint_type, + "tmp_unpack_half_1x16_u32"); + + /* uint e = E_RVAL; */ + ir_variable *e = factory.make_temp(glsl_type::uint_type, + "tmp_unpack_half_1x16_e"); + factory.emit(assign(e, e_rval)); + + /* uint m = M_RVAL; */ + ir_variable *m = factory.make_temp(glsl_type::uint_type, + "tmp_unpack_half_1x16_m"); + factory.emit(assign(m, m_rval)); + + /* Preliminaries + * ------------- + * + * For a float16, the bit layout is: + * + * sign: 15 + * exponent: 10:14 + * mantissa: 0:9 + * + * Let f16 be a float16 value. The sign, exponent, and mantissa + * determine its value thus: + * + * if e16 = 0 and m16 = 0, then zero: (-1)^s16 * 0 (1) + * if e16 = 0 and m16!= 0, then subnormal: (-1)^s16 * 2^(e16 - 14) * (m16 / 2^10) (2) + * if 0 < e16 < 31, then normal: (-1)^s16 * 2^(e16 - 15) * (1 + m16 / 2^10) (3) + * if e16 = 31 and m16 = 0, then infinite: (-1)^s16 * inf (4) + * if e16 = 31 and m16 != 0, then NaN (5) + * + * where 0 <= m16 < 2^10. + * + * For a float32, the bit layout is: + * + * sign: 31 + * exponent: 23:30 + * mantissa: 0:22 + * + * Let f32 be a float32 value. The sign, exponent, and mantissa + * determine its value thus: + * + * if e32 = 0 and m32 = 0, then zero: (-1)^s * 0 (10) + * if e32 = 0 and m32 != 0, then subnormal: (-1)^s * 2^(e32 - 126) * (m32 / 2^23) (11) + * if 0 < e32 < 255, then normal: (-1)^s * 2^(e32 - 127) * (1 + m32 / 2^23) (12) + * if e32 = 255 and m32 = 0, then infinite: (-1)^s * inf (13) + * if e32 = 255 and m32 != 0, then NaN (14) + * + * where 0 <= m32 < 2^23. + * + * Calculation + * ----------- + * Our task is to compute s32, e32, m32 given f16. Since this function + * ignores the sign bit, assume that s32 = s16 = 0. There are several + * cases consider. + */ + + factory.emit( + + /* Case 1) f16 is zero or subnormal. + * + * The simplest method of calcuating f32 in this case is + * + * f32 = f16 (20) + * = 2^(-14) * (m16 / 2^10) (21) + * = m16 / 2^(-24) (22) + */ + + /* if (e16 == 0) { */ + if_tree(equal(e, constant(0u)), + + /* u32 = bitcast_f2u(float(m) / float(1 << 24)); */ + assign(u32, expr(ir_unop_bitcast_f2u, + div(u2f(m), constant((float)(1 << 24))))), + + /* Case 2) f16 is normal. + * + * The equation + * + * f32 = f16 (30) + * 2^(e32 - 127) * (1 + m32 / 2^23) = (31) + * 2^(e16 - 15) * (1 + m16 / 2^10) + * + * can be decomposed into two + * + * 2^(e32 - 127) = 2^(e16 - 15) (32) + * 1 + m32 / 2^23 = 1 + m16 / 2^10 (33) + * + * which solve to + * + * e32 = e16 + 112 (34) + * m32 = m16 * 2^13 (35) + */ + + /* } else if (e16 < 31)) { */ + if_tree(less(e, constant(31u << 10u)), + + /* u32 = ((e + (112 << 10)) | m) << 13; + */ + assign(u32, lshift(bit_or(add(e, constant(112u << 10u)), m), + constant(13u))), + + + /* Case 3) f16 is infinite. */ + if_tree(equal(m, constant(0u)), + + assign(u32, constant(255u << 23u)), + + /* Case 4) f16 is NaN. */ + /* } else { */ + + assign(u32, constant(0x7fffffffu)))))); + + /* } */ + + return deref(u32).val; + } + + /** + * \brief Lower an unpackHalf2x16 expression. + * + * \param uint_rval is unpackHalf2x16's input + * \return unpackHalf2x16's output as a vec2 rvalue + */ + ir_rvalue* + lower_unpack_half_2x16(ir_rvalue *uint_rval) + { + /* From page 89 (95 of pdf) of the GLSL ES 3.00 spec: + * + * mediump vec2 unpackHalf2x16 (highp uint v) + * ------------------------------------------ + * Returns a two-component floating-point vector with components + * obtained by unpacking a 32-bit unsigned integer into a pair of 16-bit + * values, interpreting those values as 16-bit floating-point numbers + * according to the OpenGL ES Specification, and converting them to + * 32-bit floating-point values. + * + * The first component of the vector is obtained from the + * 16 least-significant bits of v; the second component is obtained + * from the 16 most-significant bits of v. + */ + assert(uint_rval->type == glsl_type::uint_type); + + /* uint u = RVALUE; + * uvec2 f16 = uvec2(u.x & 0xffff, u.y >> 16); + */ + ir_variable *f16 = factory.make_temp(glsl_type::uvec2_type, + "tmp_unpack_half_2x16_f16"); + factory.emit(assign(f16, unpack_uint_to_uvec2(uint_rval))); + + /* uvec2 f32; */ + ir_variable *f32 = factory.make_temp(glsl_type::uvec2_type, + "tmp_unpack_half_2x16_f32"); + + /* Get f16's unshifted exponent bits. + * + * uvec2 e = f16 & 0x7c00u; + */ + ir_variable *e = factory.make_temp(glsl_type::uvec2_type, + "tmp_unpack_half_2x16_e"); + factory.emit(assign(e, bit_and(f16, constant(0x7c00u)))); + + /* Get f16's unshifted mantissa bits. + * + * uvec2 m = f16 & 0x03ffu; + */ + ir_variable *m = factory.make_temp(glsl_type::uvec2_type, + "tmp_unpack_half_2x16_m"); + factory.emit(assign(m, bit_and(f16, constant(0x03ffu)))); + + /* Set f32's exponent and mantissa bits. + * + * f32.x = unpack_half_1x16_nosign(e.x, m.x); + * f32.y = unpack_half_1x16_nosign(e.y, m.y); + */ + factory.emit(assign(f32, unpack_half_1x16_nosign(swizzle_x(e), + swizzle_x(m)), + WRITEMASK_X)); + factory.emit(assign(f32, unpack_half_1x16_nosign(swizzle_y(e), + swizzle_y(m)), + WRITEMASK_Y)); + + /* Set f32's sign bit. + * + * f32 |= (f16 & 0x8000u) << 16u; + */ + factory.emit(assign(f32, bit_or(f32, + lshift(bit_and(f16, + constant(0x8000u)), + constant(16u))))); + + /* return bitcast_u2f(f32); */ + ir_rvalue *result = expr(ir_unop_bitcast_u2f, f32); + assert(result->type == glsl_type::vec2_type); + return result; + } + + /** + * \brief Split unpackHalf2x16 into two operations. + * + * \param uint_rval is unpackHalf2x16's input + * \return a vec2 rvalue + * + * Some code generators, such as the i965 fragment shader, require that all + * vector expressions be lowered to a sequence of scalar expressions. + * However, unpackHalf2x16 cannot be scalarized by the same method as + * a true vector operation because the number of components of its input + * and output differ. + * + * This method scalarizes unpackHalf2x16 by transforming it from a single + * operation having vec2 output to a pair of operations each having float + * output. That is, it transforms + * + * unpackHalf2x16(UINT_RVAL) + * + * into + * + * uint u = UINT_RVAL; + * vec2 v; + * + * v.x = unpackHalf2x16_split_x(u); + * v.y = unpackHalf2x16_split_y(u); + * + * return v; + */ + ir_rvalue* + split_unpack_half_2x16(ir_rvalue *uint_rval) + { + assert(uint_rval->type == glsl_type::uint_type); + + /* uint u = uint_rval; */ + ir_variable *u = factory.make_temp(glsl_type::uint_type, + "tmp_split_unpack_half_2x16_u"); + factory.emit(assign(u, uint_rval)); + + /* vec2 v; */ + ir_variable *v = factory.make_temp(glsl_type::vec2_type, + "tmp_split_unpack_half_2x16_v"); + + /* v.x = unpack_half_2x16_split_x(u); */ + factory.emit(assign(v, expr(ir_unop_unpack_half_2x16_split_x, u), + WRITEMASK_X)); + + /* v.y = unpack_half_2x16_split_y(u); */ + factory.emit(assign(v, expr(ir_unop_unpack_half_2x16_split_y, u), + WRITEMASK_Y)); + + return deref(v).val; + } +}; + +} // namespace anonymous + +/** + * \brief Lower the builtin packing functions. + * + * \param op_mask is a bitmask of `enum lower_packing_builtins_op`. + */ +bool +lower_packing_builtins(exec_list *instructions, int op_mask) +{ + lower_packing_builtins_visitor v(op_mask); + visit_list_elements(&v, instructions, true); + return v.get_progress(); +} |