/*
 * Copyright © 2010 Intel Corporation
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
 * DEALINGS IN THE SOFTWARE.
 */

/**
 * \file brw_wm_channel_expressions.cpp
 *
 * Breaks vector operations down into operations on each component.
 *
 * The 965 fragment shader receives 8 or 16 pixels at a time, so each
 * channel of a vector is laid out as 1 or 2 8-float registers.  Each
 * ALU operation operates on one of those channel registers.  As a
 * result, there is no value to the 965 fragment shader in tracking
 * "vector" expressions in the sense of GLSL fragment shaders, when
 * doing a channel at a time may help in constant folding, algebraic
 * simplification, and reducing the liveness of channel registers.
 *
 * The exception to the desire to break everything down to floats is
 * texturing.  The texture sampler returns a writemasked 4/8-register
 * sequence containing the texture values.  We don't want to dispatch
 * to the sampler separately for each channel we need, so we do retain
 * the vector types in that case.
 */

#include "main/core.h"
#include "brw_wm.h"
#include "glsl/ir.h"
#include "glsl/ir_expression_flattening.h"
#include "glsl/glsl_types.h"

class ir_channel_expressions_visitor : public ir_hierarchical_visitor {
public:
   ir_channel_expressions_visitor()
   {
      this->progress = false;
      this->mem_ctx = NULL;
   }

   ir_visitor_status visit_leave(ir_assignment *);

   ir_rvalue *get_element(ir_variable *var, unsigned int element);
   void assign(ir_assignment *ir, int elem, ir_rvalue *val);

   bool progress;
   void *mem_ctx;
};

static bool
channel_expressions_predicate(ir_instruction *ir)
{
   ir_expression *expr = ir->as_expression();
   unsigned int i;

   if (!expr)
      return false;

   switch (expr->operation) {
   /* these opcodes need to act on the whole vector,
    * just like texturing.
    */
   case ir_unop_interpolate_at_centroid:
   case ir_binop_interpolate_at_offset:
   case ir_binop_interpolate_at_sample:
      return false;
   default:
      break;
   }

   for (i = 0; i < expr->get_num_operands(); i++) {
      if (expr->operands[i]->type->is_vector())
         return true;
   }

   return false;
}

bool
brw_do_channel_expressions(exec_list *instructions)
{
   ir_channel_expressions_visitor v;

   /* Pull out any matrix expression to a separate assignment to a
    * temp.  This will make our handling of the breakdown to
    * operations on the matrix's vector components much easier.
    */
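   /* channel_expressions_predicate() above matches any expression with a
    * vector operand (other than the interpolate_at_* ops), so after
    * flattening each such expression is the RHS of its own full-writemask
    * assignment to a temporary.  visit_leave(ir_assignment *) below relies
    * on that shape when splitting the assignment into per-channel pieces.
    */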
   do_expression_flattening(instructions, channel_expressions_predicate);

   visit_list_elements(&v, instructions);

   return v.progress;
}

ir_rvalue *
ir_channel_expressions_visitor::get_element(ir_variable *var, unsigned int elem)
{
   ir_dereference *deref;

   if (var->type->is_scalar())
      return new(mem_ctx) ir_dereference_variable(var);

   assert(elem < var->type->components());

   deref = new(mem_ctx) ir_dereference_variable(var);
   return new(mem_ctx) ir_swizzle(deref, elem, 0, 0, 0, 1);
}

void
ir_channel_expressions_visitor::assign(ir_assignment *ir, int elem, ir_rvalue *val)
{
   ir_dereference *lhs = ir->lhs->clone(mem_ctx, NULL);
   ir_assignment *assign;

   /* This assign-of-expression should have been generated by the
    * expression flattening visitor (since we never short circuit to
    * not flatten, even for plain assignments of variables), so the
    * writemask is always full.
    */
   assert(ir->write_mask == (1 << ir->lhs->type->components()) - 1);

   assign = new(mem_ctx) ir_assignment(lhs, val, NULL, (1 << elem));
   ir->insert_before(assign);
}

ir_visitor_status
ir_channel_expressions_visitor::visit_leave(ir_assignment *ir)
{
   ir_expression *expr = ir->rhs->as_expression();
   bool found_vector = false;
   unsigned int i, vector_elements = 1;
   ir_variable *op_var[3];

   if (!expr)
      return visit_continue;

   if (!this->mem_ctx)
      this->mem_ctx = ralloc_parent(ir);

   for (i = 0; i < expr->get_num_operands(); i++) {
      if (expr->operands[i]->type->is_vector()) {
         found_vector = true;
         vector_elements = expr->operands[i]->type->vector_elements;
         break;
      }
   }
   if (!found_vector)
      return visit_continue;

   switch (expr->operation) {
   case ir_unop_interpolate_at_centroid:
   case ir_binop_interpolate_at_offset:
   case ir_binop_interpolate_at_sample:
      return visit_continue;
   default:
      break;
   }

   /* Store the expression operands in temps so we can use them
    * multiple times.
    */
   for (i = 0; i < expr->get_num_operands(); i++) {
      ir_assignment *assign;
      ir_dereference *deref;

      assert(!expr->operands[i]->type->is_matrix());

      op_var[i] = new(mem_ctx) ir_variable(expr->operands[i]->type,
                                           "channel_expressions",
                                           ir_var_temporary);
      ir->insert_before(op_var[i]);

      deref = new(mem_ctx) ir_dereference_variable(op_var[i]);
      assign = new(mem_ctx) ir_assignment(deref,
                                          expr->operands[i],
                                          NULL);
      ir->insert_before(assign);
   }

   const glsl_type *element_type =
      glsl_type::get_instance(ir->lhs->type->base_type, 1, 1);

   /* OK, time to break down this vector operation. */
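   /* Illustrative sketch of the rewrite (variable names made up): for a vec2
    * assignment whose RHS is an add, roughly
    *
    *    (assign (xy) (var_ref t) (expression vec2 + (var_ref a) (var_ref b)))
    *
    * the cases below emit one scalar assignment per channel, reading the
    * channel_expressions temporaries created above through swizzles,
    *
    *    (assign (x) (var_ref t) (expression float + (swiz x ...) (swiz x ...)))
    *    (assign (y) (var_ref t) (expression float + (swiz y ...) (swiz y ...)))
    *
    * and the original vector assignment is removed at the end of this
    * function.
    */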
   switch (expr->operation) {
   case ir_unop_bit_not:
   case ir_unop_logic_not:
   case ir_unop_neg:
   case ir_unop_abs:
   case ir_unop_sign:
   case ir_unop_rcp:
   case ir_unop_rsq:
   case ir_unop_sqrt:
   case ir_unop_exp:
   case ir_unop_log:
   case ir_unop_exp2:
   case ir_unop_log2:
   case ir_unop_bitcast_i2f:
   case ir_unop_bitcast_f2i:
   case ir_unop_bitcast_f2u:
   case ir_unop_bitcast_u2f:
   case ir_unop_i2u:
   case ir_unop_u2i:
   case ir_unop_f2i:
   case ir_unop_f2u:
   case ir_unop_i2f:
   case ir_unop_f2b:
   case ir_unop_b2f:
   case ir_unop_i2b:
   case ir_unop_b2i:
   case ir_unop_u2f:
   case ir_unop_trunc:
   case ir_unop_ceil:
   case ir_unop_floor:
   case ir_unop_fract:
   case ir_unop_round_even:
   case ir_unop_sin:
   case ir_unop_cos:
   case ir_unop_dFdx:
   case ir_unop_dFdx_coarse:
   case ir_unop_dFdx_fine:
   case ir_unop_dFdy:
   case ir_unop_dFdy_coarse:
   case ir_unop_dFdy_fine:
   case ir_unop_bitfield_reverse:
   case ir_unop_bit_count:
   case ir_unop_find_msb:
   case ir_unop_find_lsb:
   case ir_unop_saturate:
   case ir_unop_subroutine_to_int:
      for (i = 0; i < vector_elements; i++) {
         ir_rvalue *op0 = get_element(op_var[0], i);

         assign(ir, i, new(mem_ctx) ir_expression(expr->operation,
                                                  element_type,
                                                  op0,
                                                  NULL));
      }
      break;

   case ir_binop_add:
   case ir_binop_sub:
   case ir_binop_mul:
   case ir_binop_imul_high:
   case ir_binop_div:
   case ir_binop_carry:
   case ir_binop_borrow:
   case ir_binop_mod:
   case ir_binop_min:
   case ir_binop_max:
   case ir_binop_pow:
   case ir_binop_lshift:
   case ir_binop_rshift:
   case ir_binop_bit_and:
   case ir_binop_bit_xor:
   case ir_binop_bit_or:
   case ir_binop_logic_and:
   case ir_binop_logic_xor:
   case ir_binop_logic_or:
   case ir_binop_less:
   case ir_binop_greater:
   case ir_binop_lequal:
   case ir_binop_gequal:
   case ir_binop_equal:
   case ir_binop_nequal:
      for (i = 0; i < vector_elements; i++) {
         ir_rvalue *op0 = get_element(op_var[0], i);
         ir_rvalue *op1 = get_element(op_var[1], i);

         assign(ir, i, new(mem_ctx) ir_expression(expr->operation,
                                                  element_type,
                                                  op0,
                                                  op1));
      }
      break;

   case ir_unop_any: {
      ir_expression *temp;
      temp = new(mem_ctx) ir_expression(ir_binop_logic_or,
                                        element_type,
                                        get_element(op_var[0], 0),
                                        get_element(op_var[0], 1));

      for (i = 2; i < vector_elements; i++) {
         temp = new(mem_ctx) ir_expression(ir_binop_logic_or,
                                           element_type,
                                           get_element(op_var[0], i),
                                           temp);
      }

      assign(ir, 0, temp);
      break;
   }

   case ir_binop_dot: {
      ir_expression *last = NULL;
      for (i = 0; i < vector_elements; i++) {
         ir_rvalue *op0 = get_element(op_var[0], i);
         ir_rvalue *op1 = get_element(op_var[1], i);
         ir_expression *temp;

         temp = new(mem_ctx) ir_expression(ir_binop_mul,
                                           element_type,
                                           op0,
                                           op1);
         if (last) {
            last = new(mem_ctx) ir_expression(ir_binop_add,
                                              element_type,
                                              temp,
                                              last);
         } else {
            last = temp;
         }
      }
      assign(ir, 0, last);
      break;
   }

   case ir_binop_all_equal:
   case ir_binop_any_nequal: {
      ir_expression *last = NULL;
      for (i = 0; i < vector_elements; i++) {
         ir_rvalue *op0 = get_element(op_var[0], i);
         ir_rvalue *op1 = get_element(op_var[1], i);
         ir_expression *temp;
         ir_expression_operation join;

         if (expr->operation == ir_binop_all_equal)
            join = ir_binop_logic_and;
         else
            join = ir_binop_logic_or;

         temp = new(mem_ctx) ir_expression(expr->operation,
                                           element_type,
                                           op0,
                                           op1);
         if (last) {
            last = new(mem_ctx) ir_expression(join,
                                              element_type,
                                              temp,
                                              last);
         } else {
            last = temp;
         }
      }
      assign(ir, 0, last);
      break;
   }

   case ir_unop_noise:
      unreachable("noise should have been broken down to function call");

   case ir_binop_bfm: {
      /* Does not need to be scalarized, since its result will be identical
       * for all channels.
       */
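      /* So emit a single scalar BFM on element 0 of each operand and write
       * it to channel 0 of the destination.
       */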
      ir_rvalue *op0 = get_element(op_var[0], 0);
      ir_rvalue *op1 = get_element(op_var[1], 0);

      assign(ir, 0, new(mem_ctx) ir_expression(expr->operation,
                                               element_type,
                                               op0,
                                               op1));
      break;
   }

   case ir_binop_ubo_load:
   case ir_unop_get_buffer_size:
      unreachable("not yet supported");

   case ir_triop_fma:
   case ir_triop_lrp:
   case ir_triop_csel:
   case ir_triop_bitfield_extract:
      for (i = 0; i < vector_elements; i++) {
         ir_rvalue *op0 = get_element(op_var[0], i);
         ir_rvalue *op1 = get_element(op_var[1], i);
         ir_rvalue *op2 = get_element(op_var[2], i);

         assign(ir, i, new(mem_ctx) ir_expression(expr->operation,
                                                  element_type,
                                                  op0,
                                                  op1,
                                                  op2));
      }
      break;

   case ir_triop_bfi: {
      /* Only a single BFM is needed for multiple BFIs. */
      ir_rvalue *op0 = get_element(op_var[0], 0);

      for (i = 0; i < vector_elements; i++) {
         ir_rvalue *op1 = get_element(op_var[1], i);
         ir_rvalue *op2 = get_element(op_var[2], i);

         assign(ir, i, new(mem_ctx) ir_expression(expr->operation,
                                                  element_type,
                                                  op0->clone(mem_ctx, NULL),
                                                  op1,
                                                  op2));
      }
      break;
   }

   case ir_unop_pack_snorm_2x16:
   case ir_unop_pack_snorm_4x8:
   case ir_unop_pack_unorm_2x16:
   case ir_unop_pack_unorm_4x8:
   case ir_unop_pack_half_2x16:
   case ir_unop_unpack_snorm_2x16:
   case ir_unop_unpack_snorm_4x8:
   case ir_unop_unpack_unorm_2x16:
   case ir_unop_unpack_unorm_4x8:
   case ir_unop_unpack_half_2x16:
   case ir_binop_ldexp:
   case ir_binop_vector_extract:
   case ir_triop_vector_insert:
   case ir_quadop_bitfield_insert:
   case ir_quadop_vector:
   case ir_unop_ssbo_unsized_array_length:
      unreachable("should have been lowered");

   case ir_unop_unpack_half_2x16_split_x:
   case ir_unop_unpack_half_2x16_split_y:
   case ir_binop_pack_half_2x16_split:
   case ir_unop_interpolate_at_centroid:
   case ir_binop_interpolate_at_offset:
   case ir_binop_interpolate_at_sample:
      unreachable("not reached: expression operates on scalars only");

   case ir_unop_pack_double_2x32:
   case ir_unop_unpack_double_2x32:
   case ir_unop_frexp_sig:
   case ir_unop_frexp_exp:
   case ir_unop_d2f:
   case ir_unop_f2d:
   case ir_unop_d2i:
   case ir_unop_i2d:
   case ir_unop_d2u:
   case ir_unop_u2d:
   case ir_unop_d2b:
      unreachable("no fp64 support yet");
   }

   ir->remove();

   this->progress = true;

   return visit_continue;
}
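/* Usage sketch (hypothetical call site, not part of this file):
 *
 *    exec_list *ir = shader->ir;   // the shader's GLSL IR instruction list
 *    bool progress = brw_do_channel_expressions(ir);
 *
 * The return value reports whether any vector expression was rewritten, so
 * the pass can participate in a fixed-point lowering loop alongside other
 * GLSL IR passes.
 */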