diff options
author | Christoph Bumiller <[email protected]> | 2010-08-18 14:37:47 +0200 |
---|---|---|
committer | Christoph Bumiller <[email protected]> | 2010-08-18 14:37:47 +0200 |
commit | 3e54d63429fe7ca5db3c75c181abbaf7a7f55724 (patch) | |
tree | e129c36aaef712525f0a04fc5b06c445e3cf84df /src/mesa/drivers/dri | |
parent | eaab76457818fad0926b84c663440e8987e1f19f (diff) | |
parent | 85d9bc236d6a8ff8f12cbc2150f8c3740354f573 (diff) |
Merge remote branch 'origin/master' into nv50-compiler
Diffstat (limited to 'src/mesa/drivers/dri')
74 files changed, 2114 insertions, 828 deletions
diff --git a/src/mesa/drivers/dri/common/dri_metaops.c b/src/mesa/drivers/dri/common/dri_metaops.c index 86e59a8e51c..a2f404b616f 100644 --- a/src/mesa/drivers/dri/common/dri_metaops.c +++ b/src/mesa/drivers/dri/common/dri_metaops.c @@ -29,6 +29,7 @@ #include "main/arbprogram.h" #include "main/arrayobj.h" #include "main/bufferobj.h" +#include "main/context.h" #include "main/enable.h" #include "main/matrix.h" #include "main/texstate.h" diff --git a/src/mesa/drivers/dri/i810/i810render.c b/src/mesa/drivers/dri/i810/i810render.c index b543d4f012c..205f0cebc1c 100644 --- a/src/mesa/drivers/dri/i810/i810render.c +++ b/src/mesa/drivers/dri/i810/i810render.c @@ -37,6 +37,8 @@ #include "main/imports.h" #include "main/mtypes.h" +#include "math/m_xform.h" + #include "tnl/t_context.h" #include "i810screen.h" diff --git a/src/mesa/drivers/dri/i915/Makefile b/src/mesa/drivers/dri/i915/Makefile index 71ee753748c..65fd658c047 100644 --- a/src/mesa/drivers/dri/i915/Makefile +++ b/src/mesa/drivers/dri/i915/Makefile @@ -56,7 +56,7 @@ C_SOURCES = \ ASM_SOURCES = -DRIVER_DEFINES = -I../intel -I../intel/server -DI915 \ +DRIVER_DEFINES = -I../intel -DI915 \ $(shell pkg-config libdrm --atleast-version=2.3.1 \ && echo "-DDRM_VBLANK_FLIP=DRM_VBLANK_FLIP") diff --git a/src/mesa/drivers/dri/i915/intel_render.c b/src/mesa/drivers/dri/i915/intel_render.c index ec209391ab4..add0adacb56 100644 --- a/src/mesa/drivers/dri/i915/intel_render.c +++ b/src/mesa/drivers/dri/i915/intel_render.c @@ -37,6 +37,8 @@ #include "main/mtypes.h" #include "main/enums.h" +#include "math/m_xform.h" + #include "tnl/t_context.h" #include "tnl/t_vertex.h" #include "tnl/t_pipeline.h" diff --git a/src/mesa/drivers/dri/i965/Makefile b/src/mesa/drivers/dri/i965/Makefile index 831981558d8..e381a5c714b 100644 --- a/src/mesa/drivers/dri/i965/Makefile +++ b/src/mesa/drivers/dri/i965/Makefile @@ -106,7 +106,7 @@ C_SOURCES = \ ASM_SOURCES = -DRIVER_DEFINES = -I../intel -I../intel/server +DRIVER_DEFINES = -I../intel INCLUDES += $(INTEL_CFLAGS) DRI_LIB_DEPS += $(INTEL_LIBS) diff --git a/src/mesa/drivers/dri/i965/brw_clip_util.c b/src/mesa/drivers/dri/i965/brw_clip_util.c index a74bbc25643..d2ac1235e46 100644 --- a/src/mesa/drivers/dri/i965/brw_clip_util.c +++ b/src/mesa/drivers/dri/i965/brw_clip_util.c @@ -192,11 +192,6 @@ void brw_clip_interp_vertex( struct brw_clip_compile *c, brw_clip_project_vertex(c, dest_ptr ); } - - - -#define MAX_MRF 16 - void brw_clip_emit_vue(struct brw_clip_compile *c, struct brw_indirect vert, GLboolean allocate, diff --git a/src/mesa/drivers/dri/i965/brw_defines.h b/src/mesa/drivers/dri/i965/brw_defines.h index 6b20a2979f8..f7a68cead7c 100644 --- a/src/mesa/drivers/dri/i965/brw_defines.h +++ b/src/mesa/drivers/dri/i965/brw_defines.h @@ -604,6 +604,8 @@ #define BRW_ARF_NOTIFICATION_COUNT 0x90 #define BRW_ARF_IP 0xA0 +#define BRW_MRF_COMPR4 (1 << 7) + #define BRW_AMASK 0 #define BRW_IMASK 1 #define BRW_LMASK 2 diff --git a/src/mesa/drivers/dri/i965/brw_eu.h b/src/mesa/drivers/dri/i965/brw_eu.h index 31ff86cf731..ffdddd0a388 100644 --- a/src/mesa/drivers/dri/i965/brw_eu.h +++ b/src/mesa/drivers/dri/i965/brw_eu.h @@ -984,5 +984,7 @@ void brw_set_src1( struct brw_instruction *insn, /* brw_optimize.c */ void brw_optimize(struct brw_compile *p); +void brw_remove_duplicate_mrf_moves(struct brw_compile *p); +void brw_remove_grf_to_mrf_moves(struct brw_compile *p); #endif diff --git a/src/mesa/drivers/dri/i965/brw_optimize.c b/src/mesa/drivers/dri/i965/brw_optimize.c index a364b158209..8aa6fb6cc6f 100644 --- a/src/mesa/drivers/dri/i965/brw_optimize.c +++ b/src/mesa/drivers/dri/i965/brw_optimize.c @@ -32,6 +32,594 @@ #include "brw_defines.h" #include "brw_eu.h" +static const struct { + char *name; + int nsrc; + int ndst; + GLboolean is_arith; +} inst_opcode[128] = { + [BRW_OPCODE_MOV] = { .name = "mov", .nsrc = 1, .ndst = 1, .is_arith = 1 }, + [BRW_OPCODE_FRC] = { .name = "frc", .nsrc = 1, .ndst = 1, .is_arith = 1 }, + [BRW_OPCODE_RNDU] = { .name = "rndu", .nsrc = 1, .ndst = 1, .is_arith = 1 }, + [BRW_OPCODE_RNDD] = { .name = "rndd", .nsrc = 1, .ndst = 1, .is_arith = 1 }, + [BRW_OPCODE_RNDE] = { .name = "rnde", .nsrc = 1, .ndst = 1, .is_arith = 1 }, + [BRW_OPCODE_RNDZ] = { .name = "rndz", .nsrc = 1, .ndst = 1, .is_arith = 1 }, + [BRW_OPCODE_NOT] = { .name = "not", .nsrc = 1, .ndst = 1, .is_arith = 1 }, + [BRW_OPCODE_LZD] = { .name = "lzd", .nsrc = 1, .ndst = 1 }, + + [BRW_OPCODE_MUL] = { .name = "mul", .nsrc = 2, .ndst = 1, .is_arith = 1 }, + [BRW_OPCODE_MAC] = { .name = "mac", .nsrc = 2, .ndst = 1, .is_arith = 1 }, + [BRW_OPCODE_MACH] = { .name = "mach", .nsrc = 2, .ndst = 1, .is_arith = 1 }, + [BRW_OPCODE_LINE] = { .name = "line", .nsrc = 2, .ndst = 1, .is_arith = 1 }, + [BRW_OPCODE_PLN] = { .name = "pln", .nsrc = 2, .ndst = 1 }, + [BRW_OPCODE_SAD2] = { .name = "sad2", .nsrc = 2, .ndst = 1 }, + [BRW_OPCODE_SADA2] = { .name = "sada2", .nsrc = 2, .ndst = 1 }, + [BRW_OPCODE_DP4] = { .name = "dp4", .nsrc = 2, .ndst = 1 }, + [BRW_OPCODE_DPH] = { .name = "dph", .nsrc = 2, .ndst = 1 }, + [BRW_OPCODE_DP3] = { .name = "dp3", .nsrc = 2, .ndst = 1 }, + [BRW_OPCODE_DP2] = { .name = "dp2", .nsrc = 2, .ndst = 1 }, + [BRW_OPCODE_MATH] = { .name = "math", .nsrc = 2, .ndst = 1 }, + + [BRW_OPCODE_AVG] = { .name = "avg", .nsrc = 2, .ndst = 1, .is_arith = 1 }, + [BRW_OPCODE_ADD] = { .name = "add", .nsrc = 2, .ndst = 1, .is_arith = 1 }, + [BRW_OPCODE_SEL] = { .name = "sel", .nsrc = 2, .ndst = 1, .is_arith = 1 }, + [BRW_OPCODE_AND] = { .name = "and", .nsrc = 2, .ndst = 1, .is_arith = 1 }, + [BRW_OPCODE_OR] = { .name = "or", .nsrc = 2, .ndst = 1, .is_arith = 1 }, + [BRW_OPCODE_XOR] = { .name = "xor", .nsrc = 2, .ndst = 1, .is_arith = 1 }, + [BRW_OPCODE_SHR] = { .name = "shr", .nsrc = 2, .ndst = 1, .is_arith = 1 }, + [BRW_OPCODE_SHL] = { .name = "shl", .nsrc = 2, .ndst = 1, .is_arith = 1 }, + [BRW_OPCODE_ASR] = { .name = "asr", .nsrc = 2, .ndst = 1 }, + [BRW_OPCODE_CMP] = { .name = "cmp", .nsrc = 2, .ndst = 1 }, + [BRW_OPCODE_CMPN] = { .name = "cmpn", .nsrc = 2, .ndst = 1 }, + + [BRW_OPCODE_SEND] = { .name = "send", .nsrc = 1, .ndst = 1 }, + [BRW_OPCODE_NOP] = { .name = "nop", .nsrc = 0, .ndst = 0 }, + [BRW_OPCODE_JMPI] = { .name = "jmpi", .nsrc = 1, .ndst = 0 }, + [BRW_OPCODE_IF] = { .name = "if", .nsrc = 2, .ndst = 0 }, + [BRW_OPCODE_IFF] = { .name = "iff", .nsrc = 2, .ndst = 1 }, + [BRW_OPCODE_WHILE] = { .name = "while", .nsrc = 2, .ndst = 0 }, + [BRW_OPCODE_ELSE] = { .name = "else", .nsrc = 2, .ndst = 0 }, + [BRW_OPCODE_BREAK] = { .name = "break", .nsrc = 2, .ndst = 0 }, + [BRW_OPCODE_CONTINUE] = { .name = "cont", .nsrc = 1, .ndst = 0 }, + [BRW_OPCODE_HALT] = { .name = "halt", .nsrc = 1, .ndst = 0 }, + [BRW_OPCODE_MSAVE] = { .name = "msave", .nsrc = 1, .ndst = 1 }, + [BRW_OPCODE_PUSH] = { .name = "push", .nsrc = 1, .ndst = 1 }, + [BRW_OPCODE_MRESTORE] = { .name = "mrest", .nsrc = 1, .ndst = 1 }, + [BRW_OPCODE_POP] = { .name = "pop", .nsrc = 2, .ndst = 0 }, + [BRW_OPCODE_WAIT] = { .name = "wait", .nsrc = 1, .ndst = 0 }, + [BRW_OPCODE_DO] = { .name = "do", .nsrc = 0, .ndst = 0 }, + [BRW_OPCODE_ENDIF] = { .name = "endif", .nsrc = 2, .ndst = 0 }, +}; + +static INLINE +GLboolean brw_is_arithmetic_inst(const struct brw_instruction *inst) +{ + return inst_opcode[inst->header.opcode].is_arith; +} + +static const GLuint inst_stride[7] = { + [0] = 0, + [1] = 1, + [2] = 2, + [3] = 4, + [4] = 8, + [5] = 16, + [6] = 32 +}; + +static const GLuint inst_type_size[8] = { + [BRW_REGISTER_TYPE_UD] = 4, + [BRW_REGISTER_TYPE_D] = 4, + [BRW_REGISTER_TYPE_UW] = 2, + [BRW_REGISTER_TYPE_W] = 2, + [BRW_REGISTER_TYPE_UB] = 1, + [BRW_REGISTER_TYPE_B] = 1, + [BRW_REGISTER_TYPE_F] = 4 +}; + +static INLINE GLboolean +brw_is_grf_written(const struct brw_instruction *inst, + int reg_index, int size, + int gen) +{ + if (inst_opcode[inst->header.opcode].ndst == 0) + return GL_FALSE; + + if (inst->bits1.da1.dest_address_mode != BRW_ADDRESS_DIRECT) + if (inst->bits1.ia1.dest_reg_file == BRW_GENERAL_REGISTER_FILE) + return GL_TRUE; + + if (inst->bits1.da1.dest_reg_file != BRW_GENERAL_REGISTER_FILE) + return GL_FALSE; + + const int reg_start = reg_index * REG_SIZE; + const int reg_end = reg_start + size; + + const int type_size = inst_type_size[inst->bits1.da1.dest_reg_type]; + const int write_start = inst->bits1.da1.dest_reg_nr*REG_SIZE + + inst->bits1.da1.dest_subreg_nr; + int length, write_end; + + /* SEND is specific */ + if (inst->header.opcode == BRW_OPCODE_SEND) { + if (gen >= 5) + length = inst->bits3.generic_gen5.response_length*REG_SIZE; + else + length = inst->bits3.generic.response_length*REG_SIZE; + } + else { + length = 1 << inst->header.execution_size; + length *= type_size; + length *= inst->bits1.da1.dest_horiz_stride; + } + + /* If the two intervals intersect, we overwrite the register */ + write_end = write_start + length; + const int left = MAX2(write_start, reg_start); + const int right = MIN2(write_end, reg_end); + + return left < right; +} + +/* Specific path for message register since we need to handle the compr4 case */ +static INLINE GLboolean +brw_is_mrf_written(const struct brw_instruction *inst, int reg_index, int size) +{ + if (inst_opcode[inst->header.opcode].ndst == 0) + return GL_FALSE; + + if (inst->bits1.da1.dest_address_mode != BRW_ADDRESS_DIRECT) + if (inst->bits1.ia1.dest_reg_file == BRW_MESSAGE_REGISTER_FILE) + return GL_TRUE; + + if (inst->bits1.da1.dest_reg_file != BRW_MESSAGE_REGISTER_FILE) + return GL_FALSE; + + const int reg_start = reg_index * REG_SIZE; + const int reg_end = reg_start + size; + + const int mrf_index = inst->bits1.da1.dest_reg_nr & 0x0f; + const int is_compr4 = inst->bits1.da1.dest_reg_nr & BRW_MRF_COMPR4; + const int type_size = inst_type_size[inst->bits1.da1.dest_reg_type]; + + /* We use compr4 with a size != 16 elements. Strange, we conservatively + * consider that we are writing the register. + */ + if (is_compr4 && inst->header.execution_size != BRW_EXECUTE_16) + return GL_TRUE; + + GLboolean is_written = GL_FALSE; + + /* Here we write mrf_{i} and mrf_{i+4}. So we read two times 8 elements */ + if (is_compr4) { + const int length = 8 * type_size * inst->bits1.da1.dest_horiz_stride; + + /* First 8-way register */ + const int write_start0 = mrf_index*REG_SIZE + + inst->bits1.da1.dest_subreg_nr; + const int write_end0 = write_start0 + length; + + /* Second 8-way register */ + const int write_start1 = (mrf_index+4)*REG_SIZE + + inst->bits1.da1.dest_subreg_nr; + const int write_end1 = write_start1 + length; + + /* If the two intervals intersect, we overwrite the register */ + const int left0 = MAX2(write_start0, reg_start); + const int right0 = MIN2(write_end0, reg_end); + const int left1 = MAX2(write_start1, reg_start); + const int right1 = MIN2(write_end1, reg_end); + + is_written = left0 < right0 || left1 < right1; + } + else { + int length; + length = 1 << inst->header.execution_size; + length *= type_size; + length *= inst->bits1.da1.dest_horiz_stride; + + /* If the two intervals intersect, we write into the register */ + const int write_start = inst->bits1.da1.dest_reg_nr*REG_SIZE + + inst->bits1.da1.dest_subreg_nr; + const int write_end = write_start + length; + const int left = MAX2(write_start, reg_start); + const int right = MIN2(write_end, reg_end);; + + is_written = left < right; + } + + /* SEND may perform an implicit mov to a mrf register */ + if (is_written == GL_FALSE && + inst->header.opcode == BRW_OPCODE_SEND && + inst->bits1.da1.src0_reg_file != 0) { + + const int mrf_start = inst->header.destreg__conditionalmod; + const int write_start = mrf_start * REG_SIZE; + const int write_end = write_start + REG_SIZE; + const int left = MAX2(write_start, reg_start); + const int right = MIN2(write_end, reg_end);; + is_written = left < right; + } + + return is_written; +} + +static INLINE GLboolean +brw_is_mrf_read(const struct brw_instruction *inst, + int reg_index, int size, int gen) +{ + if (inst->header.opcode != BRW_OPCODE_SEND) + return GL_FALSE; + if (inst->bits2.da1.src0_address_mode != BRW_ADDRESS_DIRECT) + return GL_TRUE; + + const int reg_start = reg_index*REG_SIZE; + const int reg_end = reg_start + size; + + int length, read_start, read_end; + if (gen >= 5) + length = inst->bits3.generic_gen5.msg_length*REG_SIZE; + else + length = inst->bits3.generic.msg_length*REG_SIZE; + + /* Look if SEND uses an implicit mov. In that case, we read one less register + * (but we write it) + */ + if (inst->bits1.da1.src0_reg_file != 0) + read_start = inst->header.destreg__conditionalmod; + else { + length--; + read_start = inst->header.destreg__conditionalmod + 1; + } + read_start *= REG_SIZE; + read_end = read_start + length; + + const int left = MAX2(read_start, reg_start); + const int right = MIN2(read_end, reg_end); + + return left < right; +} + +static INLINE GLboolean +brw_is_grf_read(const struct brw_instruction *inst, int reg_index, int size) +{ + int i, j; + if (inst_opcode[inst->header.opcode].nsrc == 0) + return GL_FALSE; + + /* Look at first source. We must take into account register regions to + * monitor carefully the read. Note that we are a bit too conservative here + * since we do not take into account the fact that some complete registers + * may be skipped + */ + if (inst_opcode[inst->header.opcode].nsrc >= 1) { + + if (inst->bits2.da1.src0_address_mode != BRW_ADDRESS_DIRECT) + if (inst->bits1.ia1.src0_reg_file == BRW_GENERAL_REGISTER_FILE) + return GL_TRUE; + if (inst->bits1.da1.src0_reg_file != BRW_GENERAL_REGISTER_FILE) + return GL_FALSE; + + const int reg_start = reg_index*REG_SIZE; + const int reg_end = reg_start + size; + + /* See if at least one of this element intersects the interval */ + const int type_size = inst_type_size[inst->bits1.da1.src0_reg_type]; + const int elem_num = 1 << inst->header.execution_size; + const int width = 1 << inst->bits2.da1.src0_width; + const int row_num = elem_num >> inst->bits2.da1.src0_width; + const int hs = type_size*inst_stride[inst->bits2.da1.src0_horiz_stride]; + const int vs = type_size*inst_stride[inst->bits2.da1.src0_vert_stride]; + int row_start = inst->bits2.da1.src0_reg_nr*REG_SIZE + + inst->bits2.da1.src0_subreg_nr; + for (j = 0; j < row_num; ++j) { + int write_start = row_start; + for (i = 0; i < width; ++i) { + const int write_end = write_start + type_size; + const int left = write_start > reg_start ? write_start : reg_start; + const int right = write_end < reg_end ? write_end : reg_end; + if (left < right) + return GL_TRUE; + write_start += hs; + } + row_start += vs; + } + } + + /* Second src register */ + if (inst_opcode[inst->header.opcode].nsrc >= 2) { + + if (inst->bits3.da1.src1_address_mode != BRW_ADDRESS_DIRECT) + if (inst->bits1.ia1.src1_reg_file == BRW_GENERAL_REGISTER_FILE) + return GL_TRUE; + if (inst->bits1.da1.src1_reg_file != BRW_GENERAL_REGISTER_FILE) + return GL_FALSE; + + const int reg_start = reg_index*REG_SIZE; + const int reg_end = reg_start + size; + + /* See if at least one of this element intersects the interval */ + const int type_size = inst_type_size[inst->bits1.da1.src1_reg_type]; + const int elem_num = 1 << inst->header.execution_size; + const int width = 1 << inst->bits3.da1.src1_width; + const int row_num = elem_num >> inst->bits3.da1.src1_width; + const int hs = type_size*inst_stride[inst->bits3.da1.src1_horiz_stride]; + const int vs = type_size*inst_stride[inst->bits3.da1.src1_vert_stride]; + int row_start = inst->bits3.da1.src1_reg_nr*REG_SIZE + + inst->bits3.da1.src1_subreg_nr; + for (j = 0; j < row_num; ++j) { + int write_start = row_start; + for (i = 0; i < width; ++i) { + const int write_end = write_start + type_size; + const int left = write_start > reg_start ? write_start : reg_start; + const int right = write_end < reg_end ? write_end : reg_end; + if (left < right) + return GL_TRUE; + write_start += hs; + } + row_start += vs; + } + } + + return GL_FALSE; +} + +static INLINE GLboolean +brw_is_control_done(const struct brw_instruction *mov) { + return + mov->header.dependency_control != 0 || + mov->header.thread_control != 0 || + mov->header.mask_control != 0 || + mov->header.saturate != 0 || + mov->header.debug_control != 0; +} + +static INLINE GLboolean +brw_is_predicated(const struct brw_instruction *mov) { + return mov->header.predicate_control != 0; +} + +static INLINE GLboolean +brw_is_grf_to_mrf_mov(const struct brw_instruction *mov, + int *mrf_index, + int *grf_index, + GLboolean *is_compr4) +{ + if (brw_is_predicated(mov) || + brw_is_control_done(mov) || + mov->header.debug_control != 0) + return GL_FALSE; + + if (mov->bits1.da1.dest_address_mode != BRW_ADDRESS_DIRECT || + mov->bits1.da1.dest_reg_file != BRW_MESSAGE_REGISTER_FILE || + mov->bits1.da1.dest_reg_type != BRW_REGISTER_TYPE_F || + mov->bits1.da1.dest_horiz_stride != BRW_HORIZONTAL_STRIDE_1 || + mov->bits1.da1.dest_subreg_nr != 0) + return GL_FALSE; + + if (mov->bits2.da1.src0_address_mode != BRW_ADDRESS_DIRECT || + mov->bits1.da1.src0_reg_file != BRW_GENERAL_REGISTER_FILE || + mov->bits1.da1.src0_reg_type != BRW_REGISTER_TYPE_F || + mov->bits2.da1.src0_width != BRW_WIDTH_8 || + mov->bits2.da1.src0_horiz_stride != BRW_HORIZONTAL_STRIDE_1 || + mov->bits2.da1.src0_vert_stride != BRW_VERTICAL_STRIDE_8 || + mov->bits2.da1.src0_subreg_nr != 0 || + mov->bits2.da1.src0_abs != 0 || + mov->bits2.da1.src0_negate != 0) + return GL_FALSE; + + *grf_index = mov->bits2.da1.src0_reg_nr; + *mrf_index = mov->bits1.da1.dest_reg_nr & 0x0f; + *is_compr4 = (mov->bits1.da1.dest_reg_nr & BRW_MRF_COMPR4) != 0; + return GL_TRUE; +} + +static INLINE GLboolean +brw_is_grf_straight_write(const struct brw_instruction *inst, int grf_index) +{ + /* remark: no problem to predicate a SEL instruction */ + if ((!brw_is_predicated(inst) || inst->header.opcode == BRW_OPCODE_SEL) && + brw_is_control_done(inst) == GL_FALSE && + inst->header.execution_size == 4 && + inst->header.access_mode == BRW_ALIGN_1 && + inst->bits1.da1.dest_address_mode == BRW_ADDRESS_DIRECT && + inst->bits1.da1.dest_reg_file == BRW_GENERAL_REGISTER_FILE && + inst->bits1.da1.dest_reg_type == BRW_REGISTER_TYPE_F && + inst->bits1.da1.dest_horiz_stride == BRW_HORIZONTAL_STRIDE_1 && + inst->bits1.da1.dest_reg_nr == grf_index && + inst->bits1.da1.dest_subreg_nr == 0 && + brw_is_arithmetic_inst(inst)) + return GL_TRUE; + + return GL_FALSE; +} + +static INLINE GLboolean +brw_inst_are_equal(const struct brw_instruction *src0, + const struct brw_instruction *src1) +{ + const GLuint *field0 = (GLuint *) src0; + const GLuint *field1 = (GLuint *) src1; + return field0[0] == field1[0] && + field0[1] == field1[1] && + field0[2] == field1[2] && + field0[3] == field1[3]; +} + +static INLINE void +brw_inst_copy(struct brw_instruction *dst, + const struct brw_instruction *src) +{ + GLuint *field_dst = (GLuint *) dst; + const GLuint *field_src = (GLuint *) src; + field_dst[0] = field_src[0]; + field_dst[1] = field_src[1]; + field_dst[2] = field_src[2]; + field_dst[3] = field_src[3]; +} + +static void brw_remove_inst(struct brw_compile *p, const GLboolean *removeInst) +{ + int i, nr_insn = 0, to = 0, from = 0; + + for (from = 0; from < p->nr_insn; ++from) { + if (removeInst[from]) + continue; + if(to != from) + brw_inst_copy(p->store + to, p->store + from); + to++; + } + + for (i = 0; i < p->nr_insn; ++i) + if (removeInst[i] == GL_FALSE) + nr_insn++; + p->nr_insn = nr_insn; +} + +/* The gen code emitter generates a lot of duplications in the + * grf-to-mrf moves, for example when texture sampling with the same + * coordinates from multiple textures.. Here, we monitor same mov + * grf-to-mrf instrutions and remove repeated ones where the operands + * and dst ahven't changed in between. + */ +void brw_remove_duplicate_mrf_moves(struct brw_compile *p) +{ + const int gen = p->brw->intel.gen; + int i, j; + + GLboolean *removeInst = calloc(sizeof(GLboolean), p->nr_insn); + for (i = 0; i < p->nr_insn; i++) { + if (removeInst[i]) + continue; + + const struct brw_instruction *mov = p->store + i; + int mrf_index, grf_index; + GLboolean is_compr4; + + /* Only consider _straight_ grf-to-mrf moves */ + if (!brw_is_grf_to_mrf_mov(mov, &mrf_index, &grf_index, &is_compr4)) + continue; + + const int mrf_index0 = mrf_index; + const int mrf_index1 = is_compr4 ? mrf_index0+4 : mrf_index0+1; + const int simd16_size = 2 * REG_SIZE; + + for (j = i + 1; j < p->nr_insn; j++) { + const struct brw_instruction *inst = p->store + j; + + if (brw_inst_are_equal(mov, inst)) { + removeInst[j] = GL_TRUE; + continue; + } + + if (brw_is_grf_written(inst, grf_index, simd16_size, gen) || + brw_is_mrf_written(inst, mrf_index0, REG_SIZE) || + brw_is_mrf_written(inst, mrf_index1, REG_SIZE)) + break; + } + } + + brw_remove_inst(p, removeInst); + free(removeInst); +} + +/* Replace moves to MRFs where the value moved is the result of a + * normal arithmetic operation with computation right into the MRF. + */ +void brw_remove_grf_to_mrf_moves(struct brw_compile *p) +{ + int i, j, prev; + struct brw_context *brw = p->brw; + const int gen = brw->intel.gen; + const int simd16_size = 2*REG_SIZE; + + GLboolean *removeInst = calloc(sizeof(GLboolean), p->nr_insn); + assert(removeInst); + + for (i = 0; i < p->nr_insn; i++) { + if (removeInst[i]) + continue; + + struct brw_instruction *grf_inst = NULL; + const struct brw_instruction *mov = p->store + i; + int mrf_index, grf_index; + GLboolean is_compr4; + + /* Only consider _straight_ grf-to-mrf moves */ + if (!brw_is_grf_to_mrf_mov(mov, &mrf_index, &grf_index, &is_compr4)) + continue; + + /* Using comp4 enables a stride of 4 for this instruction */ + const int mrf_index0 = mrf_index; + const int mrf_index1 = is_compr4 ? mrf_index+4 : mrf_index+1; + + /* Look where the register has been set */ + prev = i; + GLboolean potential_remove = GL_FALSE; + while (prev--) { + + /* If _one_ instruction writes the grf, we try to remove the mov */ + struct brw_instruction *inst = p->store + prev; + if (brw_is_grf_straight_write(inst, grf_index)) { + potential_remove = GL_TRUE; + grf_inst = inst; + break; + } + + } + + if (potential_remove == GL_FALSE) + continue; + removeInst[i] = GL_TRUE; + + /* Monitor first the section of code between the grf computation and the + * mov. Here we cannot read or write both mrf and grf register + */ + for (j = prev + 1; j < i; ++j) { + struct brw_instruction *inst = p->store + j; + if (removeInst[j]) + continue; + if (brw_is_grf_written(inst, grf_index, simd16_size, gen) || + brw_is_grf_read(inst, grf_index, simd16_size) || + brw_is_mrf_written(inst, mrf_index0, REG_SIZE) || + brw_is_mrf_written(inst, mrf_index1, REG_SIZE) || + brw_is_mrf_read(inst, mrf_index0, REG_SIZE, gen) || + brw_is_mrf_read(inst, mrf_index1, REG_SIZE, gen)) { + removeInst[i] = GL_FALSE; + break; + } + } + + /* After the mov, we can read or write the mrf. If the grf is overwritten, + * we are done + */ + for (j = i + 1; j < p->nr_insn; ++j) { + struct brw_instruction *inst = p->store + j; + if (removeInst[j]) + continue; + + if (brw_is_grf_read(inst, grf_index, simd16_size)) { + removeInst[i] = GL_FALSE; + break; + } + + if (brw_is_grf_straight_write(inst, grf_index)) + break; + } + + /* Note that with the top down traversal, we can safely pacth the mov + * instruction + */ + if (removeInst[i]) { + grf_inst->bits1.da1.dest_reg_file = mov->bits1.da1.dest_reg_file; + grf_inst->bits1.da1.dest_reg_nr = mov->bits1.da1.dest_reg_nr; + } + } + + brw_remove_inst(p, removeInst); + free(removeInst); +} + static GLboolean is_single_channel_dp4(struct brw_instruction *insn) { diff --git a/src/mesa/drivers/dri/i965/brw_state.h b/src/mesa/drivers/dri/i965/brw_state.h index 40eece276b7..af08446f2d8 100644 --- a/src/mesa/drivers/dri/i965/brw_state.h +++ b/src/mesa/drivers/dri/i965/brw_state.h @@ -46,68 +46,68 @@ brw_add_validated_bo(struct brw_context *brw, drm_intel_bo *bo) } }; -const struct brw_tracked_state brw_blend_constant_color; -const struct brw_tracked_state brw_cc_unit; -const struct brw_tracked_state brw_check_fallback; -const struct brw_tracked_state brw_clip_prog; -const struct brw_tracked_state brw_clip_unit; -const struct brw_tracked_state brw_vs_constants; -const struct brw_tracked_state brw_wm_constants; -const struct brw_tracked_state brw_constant_buffer; -const struct brw_tracked_state brw_curbe_offsets; -const struct brw_tracked_state brw_invarient_state; -const struct brw_tracked_state brw_gs_prog; -const struct brw_tracked_state brw_gs_unit; -const struct brw_tracked_state brw_line_stipple; -const struct brw_tracked_state brw_aa_line_parameters; -const struct brw_tracked_state brw_pipelined_state_pointers; -const struct brw_tracked_state brw_binding_table_pointers; -const struct brw_tracked_state brw_depthbuffer; -const struct brw_tracked_state brw_polygon_stipple_offset; -const struct brw_tracked_state brw_polygon_stipple; -const struct brw_tracked_state brw_program_parameters; -const struct brw_tracked_state brw_recalculate_urb_fence; -const struct brw_tracked_state brw_sf_prog; -const struct brw_tracked_state brw_sf_unit; -const struct brw_tracked_state brw_sf_vp; -const struct brw_tracked_state brw_state_base_address; -const struct brw_tracked_state brw_urb_fence; -const struct brw_tracked_state brw_vertex_state; -const struct brw_tracked_state brw_vs_surfaces; -const struct brw_tracked_state brw_vs_prog; -const struct brw_tracked_state brw_vs_unit; -const struct brw_tracked_state brw_wm_input_sizes; -const struct brw_tracked_state brw_wm_prog; -const struct brw_tracked_state brw_wm_samplers; -const struct brw_tracked_state brw_wm_constant_surface; -const struct brw_tracked_state brw_wm_surfaces; -const struct brw_tracked_state brw_wm_binding_table; -const struct brw_tracked_state brw_wm_unit; - -const struct brw_tracked_state brw_psp_urb_cbs; - -const struct brw_tracked_state brw_pipe_control; - -const struct brw_tracked_state brw_drawing_rect; -const struct brw_tracked_state brw_indices; -const struct brw_tracked_state brw_vertices; -const struct brw_tracked_state brw_index_buffer; -const struct brw_tracked_state gen6_binding_table_pointers; -const struct brw_tracked_state gen6_blend_state; -const struct brw_tracked_state gen6_cc_state_pointers; -const struct brw_tracked_state gen6_clip_state; -const struct brw_tracked_state gen6_clip_vp; -const struct brw_tracked_state gen6_color_calc_state; -const struct brw_tracked_state gen6_depth_stencil_state; -const struct brw_tracked_state gen6_gs_state; -const struct brw_tracked_state gen6_sampler_state; -const struct brw_tracked_state gen6_scissor_state; -const struct brw_tracked_state gen6_sf_state; -const struct brw_tracked_state gen6_sf_vp; -const struct brw_tracked_state gen6_urb; -const struct brw_tracked_state gen6_viewport_state; -const struct brw_tracked_state gen6_vs_state; -const struct brw_tracked_state gen6_wm_state; +extern const struct brw_tracked_state brw_blend_constant_color; +extern const struct brw_tracked_state brw_cc_unit; +extern const struct brw_tracked_state brw_check_fallback; +extern const struct brw_tracked_state brw_clip_prog; +extern const struct brw_tracked_state brw_clip_unit; +extern const struct brw_tracked_state brw_vs_constants; +extern const struct brw_tracked_state brw_wm_constants; +extern const struct brw_tracked_state brw_constant_buffer; +extern const struct brw_tracked_state brw_curbe_offsets; +extern const struct brw_tracked_state brw_invarient_state; +extern const struct brw_tracked_state brw_gs_prog; +extern const struct brw_tracked_state brw_gs_unit; +extern const struct brw_tracked_state brw_line_stipple; +extern const struct brw_tracked_state brw_aa_line_parameters; +extern const struct brw_tracked_state brw_pipelined_state_pointers; +extern const struct brw_tracked_state brw_binding_table_pointers; +extern const struct brw_tracked_state brw_depthbuffer; +extern const struct brw_tracked_state brw_polygon_stipple_offset; +extern const struct brw_tracked_state brw_polygon_stipple; +extern const struct brw_tracked_state brw_program_parameters; +extern const struct brw_tracked_state brw_recalculate_urb_fence; +extern const struct brw_tracked_state brw_sf_prog; +extern const struct brw_tracked_state brw_sf_unit; +extern const struct brw_tracked_state brw_sf_vp; +extern const struct brw_tracked_state brw_state_base_address; +extern const struct brw_tracked_state brw_urb_fence; +extern const struct brw_tracked_state brw_vertex_state; +extern const struct brw_tracked_state brw_vs_surfaces; +extern const struct brw_tracked_state brw_vs_prog; +extern const struct brw_tracked_state brw_vs_unit; +extern const struct brw_tracked_state brw_wm_input_sizes; +extern const struct brw_tracked_state brw_wm_prog; +extern const struct brw_tracked_state brw_wm_samplers; +extern const struct brw_tracked_state brw_wm_constant_surface; +extern const struct brw_tracked_state brw_wm_surfaces; +extern const struct brw_tracked_state brw_wm_binding_table; +extern const struct brw_tracked_state brw_wm_unit; + +extern const struct brw_tracked_state brw_psp_urb_cbs; + +extern const struct brw_tracked_state brw_pipe_control; + +extern const struct brw_tracked_state brw_drawing_rect; +extern const struct brw_tracked_state brw_indices; +extern const struct brw_tracked_state brw_vertices; +extern const struct brw_tracked_state brw_index_buffer; +extern const struct brw_tracked_state gen6_binding_table_pointers; +extern const struct brw_tracked_state gen6_blend_state; +extern const struct brw_tracked_state gen6_cc_state_pointers; +extern const struct brw_tracked_state gen6_clip_state; +extern const struct brw_tracked_state gen6_clip_vp; +extern const struct brw_tracked_state gen6_color_calc_state; +extern const struct brw_tracked_state gen6_depth_stencil_state; +extern const struct brw_tracked_state gen6_gs_state; +extern const struct brw_tracked_state gen6_sampler_state; +extern const struct brw_tracked_state gen6_scissor_state; +extern const struct brw_tracked_state gen6_sf_state; +extern const struct brw_tracked_state gen6_sf_vp; +extern const struct brw_tracked_state gen6_urb; +extern const struct brw_tracked_state gen6_viewport_state; +extern const struct brw_tracked_state gen6_vs_state; +extern const struct brw_tracked_state gen6_wm_state; /*********************************************************************** * brw_state.c diff --git a/src/mesa/drivers/dri/i965/brw_util.c b/src/mesa/drivers/dri/i965/brw_util.c index 1db2a210d45..e878da3850d 100644 --- a/src/mesa/drivers/dri/i965/brw_util.c +++ b/src/mesa/drivers/dri/i965/brw_util.c @@ -30,6 +30,8 @@ */ +#include <assert.h> + #include "main/mtypes.h" #include "program/prog_parameter.h" #include "brw_util.h" diff --git a/src/mesa/drivers/dri/i965/brw_vs_emit.c b/src/mesa/drivers/dri/i965/brw_vs_emit.c index a1bee2e44ab..b6b558e9a69 100644 --- a/src/mesa/drivers/dri/i965/brw_vs_emit.c +++ b/src/mesa/drivers/dri/i965/brw_vs_emit.c @@ -44,6 +44,7 @@ static GLboolean brw_vs_arg_can_be_immediate(enum prog_opcode opcode, int arg) { int opcode_array[] = { + [OPCODE_MOV] = 1, [OPCODE_ADD] = 2, [OPCODE_CMP] = 3, [OPCODE_DP3] = 2, diff --git a/src/mesa/drivers/dri/i965/brw_wm_emit.c b/src/mesa/drivers/dri/i965/brw_wm_emit.c index 323cfac8fa7..d9fa2e63354 100644 --- a/src/mesa/drivers/dri/i965/brw_wm_emit.c +++ b/src/mesa/drivers/dri/i965/brw_wm_emit.c @@ -1283,7 +1283,7 @@ void emit_fb_write(struct brw_wm_compile *c, * + 1 for the second half we get destination + 4. */ brw_MOV(p, - brw_message_reg(nr + channel + (1 << 7)), + brw_message_reg(nr + channel + BRW_MRF_COMPR4), arg0[channel]); } else { /* mov (8) m2.0<1>:ud r28.0<8;8,1>:ud { Align1 } */ @@ -1712,12 +1712,20 @@ void brw_wm_emit( struct brw_wm_compile *c ) inst->dst[i]->spill_slot); } + /* Only properly tested on ILK */ + if (p->brw->intel.gen == 5) { + brw_remove_duplicate_mrf_moves(p); + if (c->dispatch_width == 16) + brw_remove_grf_to_mrf_moves(p); + } + if (INTEL_DEBUG & DEBUG_WM) { int i; - printf("wm-native:\n"); - for (i = 0; i < p->nr_insn; i++) + printf("wm-native:\n"); + for (i = 0; i < p->nr_insn; i++) brw_disasm(stderr, &p->store[i], p->brw->intel.gen); printf("\n"); } } + diff --git a/src/mesa/drivers/dri/intel/intel_context.c b/src/mesa/drivers/dri/intel/intel_context.c index 5f2035d79c9..e19f44035fd 100644 --- a/src/mesa/drivers/dri/intel/intel_context.c +++ b/src/mesa/drivers/dri/intel/intel_context.c @@ -29,6 +29,7 @@ #include "main/glheader.h" #include "main/context.h" #include "main/extensions.h" +#include "main/fbobject.h" #include "main/framebuffer.h" #include "main/imports.h" #include "main/points.h" @@ -39,8 +40,6 @@ #include "drivers/common/driverfuncs.h" #include "drivers/common/meta.h" -#include "i830_dri.h" - #include "intel_chipset.h" #include "intel_buffers.h" #include "intel_tex.h" @@ -420,7 +419,7 @@ intel_prepare_render(struct intel_context *intel) __DRIdrawable *drawable; drawable = driContext->driDrawablePriv; - if (drawable->dri2.stamp != driContext->dri2.draw_stamp) { + if (drawable && drawable->dri2.stamp != driContext->dri2.draw_stamp) { if (drawable->lastStamp != drawable->dri2.stamp) intel_update_renderbuffers(driContext, drawable); intel_draw_buffer(&intel->ctx, intel->ctx.DrawBuffer); @@ -428,7 +427,7 @@ intel_prepare_render(struct intel_context *intel) } drawable = driContext->driReadablePriv; - if (drawable->dri2.stamp != driContext->dri2.read_stamp) { + if (drawable && drawable->dri2.stamp != driContext->dri2.read_stamp) { if (drawable->lastStamp != drawable->dri2.stamp) intel_update_renderbuffers(driContext, drawable); driContext->dri2.read_stamp = drawable->dri2.stamp; @@ -613,6 +612,7 @@ intelInitContext(struct intel_context *intel, __DRIscreen *sPriv = driContextPriv->driScreenPriv; struct intel_screen *intelScreen = sPriv->private; int bo_reuse_mode; + __GLcontextModes visual; /* we can't do anything without a connection to the device */ if (intelScreen->bufmgr == NULL) @@ -624,6 +624,11 @@ intelInitContext(struct intel_context *intel, functions->Viewport = intel_viewport; } + if (mesaVis == NULL) { + memset(&visual, 0, sizeof visual); + mesaVis = &visual; + } + if (!_mesa_initialize_context_for_api(&intel->ctx, api, mesaVis, shareCtx, functions, (void *) intel)) { printf("%s: failed to init mesa context\n", __FUNCTION__); @@ -890,14 +895,21 @@ intelMakeCurrent(__DRIcontext * driContextPriv, } if (driContextPriv) { - struct gl_framebuffer *fb = driDrawPriv->driverPrivate; - struct gl_framebuffer *readFb = driReadPriv->driverPrivate; + struct gl_framebuffer *fb, *readFb; + + if (driDrawPriv == NULL && driReadPriv == NULL) { + fb = _mesa_get_incomplete_framebuffer(); + readFb = _mesa_get_incomplete_framebuffer(); + } else { + fb = driDrawPriv->driverPrivate; + readFb = driReadPriv->driverPrivate; + driContextPriv->dri2.draw_stamp = driDrawPriv->dri2.stamp - 1; + driContextPriv->dri2.read_stamp = driReadPriv->dri2.stamp - 1; + } - driContextPriv->dri2.draw_stamp = driDrawPriv->dri2.stamp - 1; - driContextPriv->dri2.read_stamp = driReadPriv->dri2.stamp - 1; intel_prepare_render(intel); _mesa_make_current(&intel->ctx, fb, readFb); - + /* We do this in intel_prepare_render() too, but intel->ctx.DrawBuffer * is NULL at that point. We can't call _mesa_makecurrent() * first, since we need the buffer size for the initial diff --git a/src/mesa/drivers/dri/intel/intel_extensions_es2.c b/src/mesa/drivers/dri/intel/intel_extensions_es2.c index baf8e130010..de34bbb2aec 100644 --- a/src/mesa/drivers/dri/intel/intel_extensions_es2.c +++ b/src/mesa/drivers/dri/intel/intel_extensions_es2.c @@ -28,7 +28,6 @@ #include "main/extensions.h" #include "intel_extensions.h" -#include "utils.h" static const char *es2_extensions[] = { /* Used by mesa internally (cf all_mesa_extensions in ../common/utils.c) */ diff --git a/src/mesa/drivers/dri/intel/intel_pixel_bitmap.c b/src/mesa/drivers/dri/intel/intel_pixel_bitmap.c index 0e2fe893fed..02c0ffce31d 100644 --- a/src/mesa/drivers/dri/intel/intel_pixel_bitmap.c +++ b/src/mesa/drivers/dri/intel/intel_pixel_bitmap.c @@ -45,6 +45,7 @@ #include "main/attrib.h" #include "main/enable.h" #include "main/viewport.h" +#include "main/context.h" #include "swrast/swrast.h" #include "intel_screen.h" diff --git a/src/mesa/drivers/dri/intel/intel_regions.c b/src/mesa/drivers/dri/intel/intel_regions.c index fe4de189600..680d18ba299 100644 --- a/src/mesa/drivers/dri/intel/intel_regions.c +++ b/src/mesa/drivers/dri/intel/intel_regions.c @@ -155,6 +155,9 @@ intel_region_alloc_internal(struct intel_context *intel, } region = calloc(sizeof(*region), 1); + if (region == NULL) + return region; + region->cpp = cpp; region->width = width; region->height = height; @@ -189,6 +192,9 @@ intel_region_alloc(struct intel_context *intel, region = intel_region_alloc_internal(intel, cpp, width, height, aligned_pitch / cpp, buffer); + if (region == NULL) + return region; + region->tiling = tiling; return region; diff --git a/src/mesa/drivers/dri/intel/intel_tex_copy.c b/src/mesa/drivers/dri/intel/intel_tex_copy.c index 224b506c05b..6efb2ddc553 100644 --- a/src/mesa/drivers/dri/intel/intel_tex_copy.c +++ b/src/mesa/drivers/dri/intel/intel_tex_copy.c @@ -102,7 +102,7 @@ do_copy_texsubimage(struct intel_context *intel, GLcontext *ctx = &intel->ctx; const struct intel_region *src = get_teximage_source(intel, internalFormat); - if (!intelImage->mt || !src) { + if (!intelImage->mt || !src || !src->buffer) { if (INTEL_DEBUG & DEBUG_FALLBACKS) fprintf(stderr, "%s fail %p %p (0x%08x)\n", __FUNCTION__, intelImage->mt, src, internalFormat); diff --git a/src/mesa/drivers/dri/intel/intel_tex_format.c b/src/mesa/drivers/dri/intel/intel_tex_format.c index 5f813c0efa2..e03b203fb40 100644 --- a/src/mesa/drivers/dri/intel/intel_tex_format.c +++ b/src/mesa/drivers/dri/intel/intel_tex_format.c @@ -19,7 +19,6 @@ intelChooseTextureFormat(GLcontext * ctx, GLint internalFormat, GLenum format, GLenum type) { struct intel_context *intel = intel_context(ctx); - const GLboolean do32bpt = (intel->ctx.Visual.rgbBits >= 24); #if 0 printf("%s intFmt=0x%x format=0x%x type=0x%x\n", @@ -30,39 +29,28 @@ intelChooseTextureFormat(GLcontext * ctx, GLint internalFormat, case 4: case GL_RGBA: case GL_COMPRESSED_RGBA: - if (format == GL_BGRA) { - if (type == GL_UNSIGNED_BYTE || type == GL_UNSIGNED_INT_8_8_8_8_REV) { - return MESA_FORMAT_ARGB8888; - } - else if (type == GL_UNSIGNED_SHORT_4_4_4_4_REV) { - return MESA_FORMAT_ARGB4444; - } - else if (type == GL_UNSIGNED_SHORT_1_5_5_5_REV) { - return MESA_FORMAT_ARGB1555; - } - } - return do32bpt ? MESA_FORMAT_ARGB8888 : MESA_FORMAT_ARGB4444; + if (type == GL_UNSIGNED_SHORT_4_4_4_4_REV) + return MESA_FORMAT_ARGB4444; + else if (type == GL_UNSIGNED_SHORT_1_5_5_5_REV) + return MESA_FORMAT_ARGB1555; + else + return MESA_FORMAT_ARGB8888; case 3: case GL_RGB: case GL_COMPRESSED_RGB: - if (format == GL_RGB && type == GL_UNSIGNED_SHORT_5_6_5) { - return MESA_FORMAT_RGB565; - } - if (do32bpt) { - if (intel->has_xrgb_textures) - return MESA_FORMAT_XRGB8888; - else - return MESA_FORMAT_ARGB8888; - } else { + if (type == GL_UNSIGNED_SHORT_5_6_5) return MESA_FORMAT_RGB565; - } + else if (intel->has_xrgb_textures) + return MESA_FORMAT_XRGB8888; + else + return MESA_FORMAT_ARGB8888; case GL_RGBA8: case GL_RGB10_A2: case GL_RGBA12: case GL_RGBA16: - return do32bpt ? MESA_FORMAT_ARGB8888 : MESA_FORMAT_ARGB4444; + return MESA_FORMAT_ARGB8888; case GL_RGBA4: case GL_RGBA2: diff --git a/src/mesa/drivers/dri/intel/server/i830_dri.h b/src/mesa/drivers/dri/intel/server/i830_dri.h deleted file mode 100644 index def049e7a6b..00000000000 --- a/src/mesa/drivers/dri/intel/server/i830_dri.h +++ /dev/null @@ -1,62 +0,0 @@ -/* $XFree86: xc/programs/Xserver/hw/xfree86/drivers/i810/i830_dri.h,v 1.6 2003/09/28 20:15:59 alanh Exp $ */ - -#ifndef _I830_DRI_H -#define _I830_DRI_H - -#include "xf86drm.h" - -#define I830_MAX_DRAWABLES 256 - -#define I830_MAJOR_VERSION 1 -#define I830_MINOR_VERSION 9 -#define I830_PATCHLEVEL 0 - -#define I830_REG_SIZE 0x80000 - -typedef struct _I830DRIRec { - drm_handle_t regs; - drmSize regsSize; - - drmSize unused1; /* backbufferSize */ - drm_handle_t unused2; /* backbuffer */ - - drmSize unused3; /* depthbufferSize */ - drm_handle_t unused4; /* depthbuffer */ - - drmSize unused5; /* rotatedSize */ - drm_handle_t unused6; /* rotatedbuffer */ - - drm_handle_t unused7; /* textures */ - int unused8; /* textureSize */ - - drm_handle_t unused9; /* agp_buffers */ - drmSize unused10; /* agp_buf_size */ - - int deviceID; - int width; - int height; - int mem; - int cpp; - int bitsPerPixel; - - int unused11[8]; /* was front/back/depth/rotated offset/pitch */ - - int unused12; /* logTextureGranularity */ - int unused13; /* textureOffset */ - - int irq; - int sarea_priv_offset; -} I830DRIRec, *I830DRIPtr; - -typedef struct { - /* Nothing here yet */ - int dummy; -} I830ConfigPrivRec, *I830ConfigPrivPtr; - -typedef struct { - /* Nothing here yet */ - int dummy; -} I830DRIContextRec, *I830DRIContextPtr; - - -#endif diff --git a/src/mesa/drivers/dri/intel/server/intel.h b/src/mesa/drivers/dri/intel/server/intel.h deleted file mode 100644 index 6ea72499c1c..00000000000 --- a/src/mesa/drivers/dri/intel/server/intel.h +++ /dev/null @@ -1,331 +0,0 @@ -#ifndef _INTEL_H_ -#define _INTEL_H_ - -#include "xf86drm.h" /* drm_handle_t, etc */ - -/* Intel */ -#ifndef PCI_CHIP_I810 -#define PCI_CHIP_I810 0x7121 -#define PCI_CHIP_I810_DC100 0x7123 -#define PCI_CHIP_I810_E 0x7125 -#define PCI_CHIP_I815 0x1132 -#define PCI_CHIP_I810_BRIDGE 0x7120 -#define PCI_CHIP_I810_DC100_BRIDGE 0x7122 -#define PCI_CHIP_I810_E_BRIDGE 0x7124 -#define PCI_CHIP_I815_BRIDGE 0x1130 -#endif - -#define PCI_CHIP_845_G 0x2562 -#define PCI_CHIP_I830_M 0x3577 - -#ifndef PCI_CHIP_I855_GM -#define PCI_CHIP_I855_GM 0x3582 -#define PCI_CHIP_I855_GM_BRIDGE 0x3580 -#endif - -#ifndef PCI_CHIP_I865_G -#define PCI_CHIP_I865_G 0x2572 -#define PCI_CHIP_I865_G_BRIDGE 0x2570 -#endif - -#ifndef PCI_CHIP_I915_G -#define PCI_CHIP_I915_G 0x2582 -#define PCI_CHIP_I915_G_BRIDGE 0x2580 -#endif - -#ifndef PCI_CHIP_I915_GM -#define PCI_CHIP_I915_GM 0x2592 -#define PCI_CHIP_I915_GM_BRIDGE 0x2590 -#endif - -#ifndef PCI_CHIP_E7221_G -#define PCI_CHIP_E7221_G 0x258A -/* Same as I915_G_BRIDGE */ -#define PCI_CHIP_E7221_G_BRIDGE 0x2580 -#endif - -#ifndef PCI_CHIP_I945_G -#define PCI_CHIP_I945_G 0x2772 -#define PCI_CHIP_I945_G_BRIDGE 0x2770 -#endif - -#ifndef PCI_CHIP_I945_GM -#define PCI_CHIP_I945_GM 0x27A2 -#define PCI_CHIP_I945_GM_BRIDGE 0x27A0 -#endif - -#define IS_I810(pI810) (pI810->Chipset == PCI_CHIP_I810 || \ - pI810->Chipset == PCI_CHIP_I810_DC100 || \ - pI810->Chipset == PCI_CHIP_I810_E) -#define IS_I815(pI810) (pI810->Chipset == PCI_CHIP_I815) -#define IS_I830(pI810) (pI810->Chipset == PCI_CHIP_I830_M) -#define IS_845G(pI810) (pI810->Chipset == PCI_CHIP_845_G) -#define IS_I85X(pI810) (pI810->Chipset == PCI_CHIP_I855_GM) -#define IS_I852(pI810) (pI810->Chipset == PCI_CHIP_I855_GM && (pI810->variant == I852_GM || pI810->variant == I852_GME)) -#define IS_I855(pI810) (pI810->Chipset == PCI_CHIP_I855_GM && (pI810->variant == I855_GM || pI810->variant == I855_GME)) -#define IS_I865G(pI810) (pI810->Chipset == PCI_CHIP_I865_G) - -#define IS_I915G(pI810) (pI810->Chipset == PCI_CHIP_I915_G || pI810->Chipset == PCI_CHIP_E7221_G) -#define IS_I915GM(pI810) (pI810->Chipset == PCI_CHIP_I915_GM) -#define IS_I945G(pI810) (pI810->Chipset == PCI_CHIP_I945_G) -#define IS_I945GM(pI810) (pI810->Chipset == PCI_CHIP_I945_GM) -#define IS_I9XX(pI810) (IS_I915G(pI810) || IS_I915GM(pI810) || IS_I945G(pI810) || IS_I945GM(pI810)) - -#define IS_MOBILE(pI810) (IS_I830(pI810) || IS_I85X(pI810) || IS_I915GM(pI810) || IS_I945GM(pI810)) - -#define I830_GMCH_CTRL 0x52 - -#define I830_GMCH_MEM_MASK 0x1 -#define I830_GMCH_MEM_64M 0x1 -#define I830_GMCH_MEM_128M 0 - -#define I830_GMCH_GMS_MASK 0x70 -#define I830_GMCH_GMS_DISABLED 0x00 -#define I830_GMCH_GMS_LOCAL 0x10 -#define I830_GMCH_GMS_STOLEN_512 0x20 -#define I830_GMCH_GMS_STOLEN_1024 0x30 -#define I830_GMCH_GMS_STOLEN_8192 0x40 - -#define I855_GMCH_GMS_MASK (0x7 << 4) -#define I855_GMCH_GMS_DISABLED 0x00 -#define I855_GMCH_GMS_STOLEN_1M (0x1 << 4) -#define I855_GMCH_GMS_STOLEN_4M (0x2 << 4) -#define I855_GMCH_GMS_STOLEN_8M (0x3 << 4) -#define I855_GMCH_GMS_STOLEN_16M (0x4 << 4) -#define I855_GMCH_GMS_STOLEN_32M (0x5 << 4) -#define I915G_GMCH_GMS_STOLEN_48M (0x6 << 4) -#define I915G_GMCH_GMS_STOLEN_64M (0x7 << 4) - -typedef unsigned char Bool; -#define TRUE 1 -#define FALSE 0 - -#define PIPE_NONE 0<<0 -#define PIPE_CRT 1<<0 -#define PIPE_TV 1<<1 -#define PIPE_DFP 1<<2 -#define PIPE_LFP 1<<3 -#define PIPE_CRT2 1<<4 -#define PIPE_TV2 1<<5 -#define PIPE_DFP2 1<<6 -#define PIPE_LFP2 1<<7 - -typedef struct _I830MemPool *I830MemPoolPtr; -typedef struct _I830MemRange *I830MemRangePtr; -typedef struct _I830MemRange { - long Start; - long End; - long Size; - unsigned long Physical; - unsigned long Offset; /* Offset of AGP-allocated portion */ - unsigned long Alignment; - drm_handle_t Key; - unsigned long Pitch; // add pitch - I830MemPoolPtr Pool; -} I830MemRange; - -typedef struct _I830MemPool { - I830MemRange Total; - I830MemRange Free; - I830MemRange Fixed; - I830MemRange Allocated; -} I830MemPool; - -typedef struct { - int tail_mask; - I830MemRange mem; - unsigned char *virtual_start; - int head; - int tail; - int space; -} I830RingBuffer; - -typedef struct _I830Rec { - unsigned char *MMIOBase; - unsigned char *FbBase; - int cpp; - uint32_t aper_size; - unsigned int bios_version; - - /* These are set in PreInit and never changed. */ - long FbMapSize; - long TotalVideoRam; - I830MemRange StolenMemory; /* pre-allocated memory */ - long BIOSMemorySize; /* min stolen pool size */ - int BIOSMemSizeLoc; - - /* These change according to what has been allocated. */ - long FreeMemory; - I830MemRange MemoryAperture; - I830MemPool StolenPool; - long allocatedMemory; - - /* Regions allocated either from the above pools, or from agpgart. */ - /* for single and dual head configurations */ - I830MemRange FrontBuffer; - I830MemRange FrontBuffer2; - I830MemRange Scratch; - I830MemRange Scratch2; - - I830RingBuffer *LpRing; - - I830MemRange BackBuffer; - I830MemRange DepthBuffer; - I830MemRange TexMem; - int TexGranularity; - I830MemRange ContextMem; - int drmMinor; - Bool have3DWindows; - - Bool NeedRingBufferLow; - Bool allowPageFlip; - Bool disableTiling; - - int Chipset; - unsigned long LinearAddr; - unsigned long MMIOAddr; - - drmSize registerSize; /**< \brief MMIO register map size */ - drm_handle_t registerHandle; /**< \brief MMIO register map handle */ - // IOADDRESS ioBase; - int irq; /**< \brief IRQ number */ - int GttBound; - - drm_handle_t ring_map; - unsigned int Fence[8]; - -} I830Rec; - -/* - * 12288 is set as the maximum, chosen because it is enough for - * 1920x1440@32bpp with a 2048 pixel line pitch with some to spare. - */ -#define I830_MAXIMUM_VBIOS_MEM 12288 -#define I830_DEFAULT_VIDEOMEM_2D (MB(32) / 1024) -#define I830_DEFAULT_VIDEOMEM_3D (MB(64) / 1024) - -/* Flags for memory allocation function */ -#define FROM_ANYWHERE 0x00000000 -#define FROM_POOL_ONLY 0x00000001 -#define FROM_NEW_ONLY 0x00000002 -#define FROM_MASK 0x0000000f - -#define ALLOCATE_AT_TOP 0x00000010 -#define ALLOCATE_AT_BOTTOM 0x00000020 -#define FORCE_GAPS 0x00000040 - -#define NEED_PHYSICAL_ADDR 0x00000100 -#define ALIGN_BOTH_ENDS 0x00000200 -#define FORCE_LOW 0x00000400 - -#define ALLOC_NO_TILING 0x00001000 -#define ALLOC_INITIAL 0x00002000 - -#define ALLOCATE_DRY_RUN 0x80000000 - -/* Chipset registers for VIDEO BIOS memory RW access */ -#define _855_DRAM_RW_CONTROL 0x58 -#define _845_DRAM_RW_CONTROL 0x90 -#define DRAM_WRITE 0x33330000 - -#define KB(x) ((x) * 1024) -#define MB(x) ((x) * KB(1024)) - -#define GTT_PAGE_SIZE KB(4) -#define ROUND_TO(x, y) (((x) + (y) - 1) / (y) * (y)) -#define ROUND_DOWN_TO(x, y) ((x) / (y) * (y)) -#define ROUND_TO_PAGE(x) ROUND_TO((x), GTT_PAGE_SIZE) -#define ROUND_TO_MB(x) ROUND_TO((x), MB(1)) -#define PRIMARY_RINGBUFFER_SIZE KB(128) - - -/* Ring buffer registers, p277, overview p19 - */ -#define LP_RING 0x2030 -#define HP_RING 0x2040 - -#define RING_TAIL 0x00 -#define TAIL_ADDR 0x000FFFF8 -#define I830_TAIL_MASK 0x001FFFF8 - -#define RING_HEAD 0x04 -#define HEAD_WRAP_COUNT 0xFFE00000 -#define HEAD_WRAP_ONE 0x00200000 -#define HEAD_ADDR 0x001FFFFC -#define I830_HEAD_MASK 0x001FFFFC - -#define RING_START 0x08 -#define START_ADDR 0x03FFFFF8 -#define I830_RING_START_MASK 0xFFFFF000 - -#define RING_LEN 0x0C -#define RING_NR_PAGES 0x001FF000 -#define I830_RING_NR_PAGES 0x001FF000 -#define RING_REPORT_MASK 0x00000006 -#define RING_REPORT_64K 0x00000002 -#define RING_REPORT_128K 0x00000004 -#define RING_NO_REPORT 0x00000000 -#define RING_VALID_MASK 0x00000001 -#define RING_VALID 0x00000001 -#define RING_INVALID 0x00000000 - - -/* Fence/Tiling ranges [0..7] - */ -#define FENCE 0x2000 -#define FENCE_NR 8 - -#define I915G_FENCE_START_MASK 0x0ff00000 - -#define I830_FENCE_START_MASK 0x07f80000 - -#define FENCE_START_MASK 0x03F80000 -#define FENCE_X_MAJOR 0x00000000 -#define FENCE_Y_MAJOR 0x00001000 -#define FENCE_SIZE_MASK 0x00000700 -#define FENCE_SIZE_512K 0x00000000 -#define FENCE_SIZE_1M 0x00000100 -#define FENCE_SIZE_2M 0x00000200 -#define FENCE_SIZE_4M 0x00000300 -#define FENCE_SIZE_8M 0x00000400 -#define FENCE_SIZE_16M 0x00000500 -#define FENCE_SIZE_32M 0x00000600 -#define FENCE_SIZE_64M 0x00000700 -#define I915G_FENCE_SIZE_1M 0x00000000 -#define I915G_FENCE_SIZE_2M 0x00000100 -#define I915G_FENCE_SIZE_4M 0x00000200 -#define I915G_FENCE_SIZE_8M 0x00000300 -#define I915G_FENCE_SIZE_16M 0x00000400 -#define I915G_FENCE_SIZE_32M 0x00000500 -#define I915G_FENCE_SIZE_64M 0x00000600 -#define I915G_FENCE_SIZE_128M 0x00000700 -#define FENCE_PITCH_1 0x00000000 -#define FENCE_PITCH_2 0x00000010 -#define FENCE_PITCH_4 0x00000020 -#define FENCE_PITCH_8 0x00000030 -#define FENCE_PITCH_16 0x00000040 -#define FENCE_PITCH_32 0x00000050 -#define FENCE_PITCH_64 0x00000060 -#define FENCE_VALID 0x00000001 - -#include <mmio.h> - -# define MMIO_IN8(base, offset) \ - *(volatile unsigned char *)(((unsigned char*)(base)) + (offset)) -# define MMIO_IN32(base, offset) \ - read_MMIO_LE32(base, offset) -# define MMIO_OUT8(base, offset, val) \ - *(volatile unsigned char *)(((unsigned char*)(base)) + (offset)) = (val) -# define MMIO_OUT32(base, offset, val) \ - *(volatile unsigned int *)(void *)(((unsigned char*)(base)) + (offset)) = CPU_TO_LE32(val) - - - /* Memory mapped register access macros */ -#define INREG8(addr) MMIO_IN8(MMIO, addr) -#define INREG(addr) MMIO_IN32(MMIO, addr) -#define OUTREG8(addr, val) MMIO_OUT8(MMIO, addr, val) -#define OUTREG(addr, val) MMIO_OUT32(MMIO, addr, val) - -#define DSPABASE 0x70184 - -#endif diff --git a/src/mesa/drivers/dri/mach64/mach64_ioctl.h b/src/mesa/drivers/dri/mach64/mach64_ioctl.h index 1ffda1932f1..9145ee6e6cf 100644 --- a/src/mesa/drivers/dri/mach64/mach64_ioctl.h +++ b/src/mesa/drivers/dri/mach64/mach64_ioctl.h @@ -32,6 +32,9 @@ #ifndef __MACH64_IOCTL_H__ #define __MACH64_IOCTL_H__ +#include <stdio.h> +#include <stdlib.h> + #include "mach64_dri.h" #include "mach64_reg.h" #include "mach64_lock.h" diff --git a/src/mesa/drivers/dri/mga/mgarender.c b/src/mesa/drivers/dri/mga/mgarender.c index 8b8fc485d31..cc0cea618d1 100644 --- a/src/mesa/drivers/dri/mga/mgarender.c +++ b/src/mesa/drivers/dri/mga/mgarender.c @@ -44,6 +44,8 @@ USE OR OTHER DEALINGS IN THE SOFTWARE. #include "main/imports.h" #include "main/mtypes.h" +#include "math/m_xform.h" + #include "tnl/t_context.h" #include "mgacontext.h" diff --git a/src/mesa/drivers/dri/nouveau/nouveau_fbo.c b/src/mesa/drivers/dri/nouveau/nouveau_fbo.c index 8be7edb150b..bd1273beea7 100644 --- a/src/mesa/drivers/dri/nouveau/nouveau_fbo.c +++ b/src/mesa/drivers/dri/nouveau/nouveau_fbo.c @@ -220,7 +220,7 @@ get_tex_format(struct gl_texture_image *ti) case MESA_FORMAT_RGB565: return GL_RGB5; default: - assert(0); + return GL_NONE; } } @@ -231,7 +231,6 @@ nouveau_render_texture(GLcontext *ctx, struct gl_framebuffer *fb, struct gl_renderbuffer *rb = att->Renderbuffer; struct gl_texture_image *ti = att->Texture->Image[att->CubeMapFace][att->TextureLevel]; - int ret; /* Allocate a renderbuffer object for the texture if we * haven't already done so. */ @@ -244,9 +243,7 @@ nouveau_render_texture(GLcontext *ctx, struct gl_framebuffer *fb, } /* Update the renderbuffer fields from the texture. */ - ret = set_renderbuffer_format(rb, get_tex_format(ti)); - assert(ret); - + set_renderbuffer_format(rb, get_tex_format(ti)); rb->Width = ti->Width; rb->Height = ti->Height; nouveau_surface_ref(&to_nouveau_teximage(ti)->surface, diff --git a/src/mesa/drivers/dri/nouveau/nouveau_texture.c b/src/mesa/drivers/dri/nouveau/nouveau_texture.c index dbf9a5cc613..442f4e899ee 100644 --- a/src/mesa/drivers/dri/nouveau/nouveau_texture.c +++ b/src/mesa/drivers/dri/nouveau/nouveau_texture.c @@ -38,6 +38,7 @@ #include "main/mipmap.h" #include "main/texfetch.h" #include "main/teximage.h" +#include "drivers/common/meta.h" static struct gl_texture_object * nouveau_texture_new(GLcontext *ctx, GLuint name, GLenum target) @@ -182,10 +183,10 @@ teximage_fits(struct gl_texture_object *t, int level) struct nouveau_surface *s = &to_nouveau_texture(t)->surfaces[level]; struct gl_texture_image *ti = t->Image[0][level]; - return ti && (t->Target == GL_TEXTURE_RECTANGLE || - (s->bo && s->width == ti->Width && - s->height == ti->Height && - s->format == ti->TexFormat)); + return ti && to_nouveau_teximage(ti)->surface.bo && + (t->Target == GL_TEXTURE_RECTANGLE || + (s->bo && s->format == ti->TexFormat && + s->width == ti->Width && s->height == ti->Height)); } static GLboolean @@ -589,6 +590,53 @@ nouveau_texture_unmap(GLcontext *ctx, struct gl_texture_object *t) } } +static void +store_mipmap(GLcontext *ctx, GLenum target, int first, int last, + struct gl_texture_object *t) +{ + struct gl_pixelstore_attrib packing = { + .BufferObj = ctx->Shared->NullBufferObj, + .Alignment = 1 + }; + GLenum format = t->Image[0][first]->TexFormat; + unsigned base_format, type, comps; + int i; + + base_format = _mesa_get_format_base_format(format); + _mesa_format_to_type_and_comps(format, &type, &comps); + + for (i = first; i <= last; i++) { + struct gl_texture_image *ti = t->Image[0][i]; + void *data = ti->Data; + + nouveau_teximage(ctx, 3, target, i, ti->InternalFormat, + ti->Width, ti->Height, ti->Depth, + ti->Border, base_format, type, data, + &packing, t, ti); + + _mesa_free_texmemory(data); + } +} + +static void +nouveau_generate_mipmap(GLcontext *ctx, GLenum target, + struct gl_texture_object *t) +{ + if (_mesa_meta_check_generate_mipmap_fallback(ctx, target, t)) { + struct gl_texture_image *base = t->Image[0][t->BaseLevel]; + + nouveau_teximage_map(ctx, base); + _mesa_generate_mipmap(ctx, target, t); + nouveau_teximage_unmap(ctx, base); + + store_mipmap(ctx, target, t->BaseLevel + 1, + get_last_level(t), t); + + } else { + _mesa_meta_GenerateMipmap(ctx, target, t); + } +} + void nouveau_texture_functions_init(struct dd_function_table *functions) { @@ -607,4 +655,5 @@ nouveau_texture_functions_init(struct dd_function_table *functions) functions->BindTexture = nouveau_bind_texture; functions->MapTexture = nouveau_texture_map; functions->UnmapTexture = nouveau_texture_unmap; + functions->GenerateMipmap = nouveau_generate_mipmap; } diff --git a/src/mesa/drivers/dri/nouveau/nv20_state_fb.c b/src/mesa/drivers/dri/nouveau/nv20_state_fb.c index 21da4f7af16..95691cad047 100644 --- a/src/mesa/drivers/dri/nouveau/nv20_state_fb.c +++ b/src/mesa/drivers/dri/nouveau/nv20_state_fb.c @@ -72,7 +72,7 @@ nv20_emit_framebuffer(GLcontext *ctx, int emit) fb->_ColorDrawBuffers[0])->surface; rt_format |= get_rt_format(s->format); - zeta_pitch = rt_pitch = s->pitch; + rt_pitch = s->pitch; nouveau_bo_markl(bctx, kelvin, NV20TCL_COLOR_OFFSET, s->bo, 0, bo_flags); @@ -88,6 +88,9 @@ nv20_emit_framebuffer(GLcontext *ctx, int emit) nouveau_bo_markl(bctx, kelvin, NV20TCL_ZETA_OFFSET, s->bo, 0, bo_flags); + } else { + rt_format |= get_rt_format(MESA_FORMAT_Z24_S8); + zeta_pitch = rt_pitch; } BEGIN_RING(chan, kelvin, NV20TCL_RT_FORMAT, 2); diff --git a/src/mesa/drivers/dri/nouveau/nv20_state_tex.c b/src/mesa/drivers/dri/nouveau/nv20_state_tex.c index e46118e4fce..2d45513bb4c 100644 --- a/src/mesa/drivers/dri/nouveau/nv20_state_tex.c +++ b/src/mesa/drivers/dri/nouveau/nv20_state_tex.c @@ -194,7 +194,8 @@ nv20_emit_tex_obj(GLcontext *ctx, int emit) | nvgl_wrap_mode(t->WrapS) << 0; tx_filter = nvgl_filter_mode(t->MagFilter) << 24 - | nvgl_filter_mode(t->MinFilter) << 16; + | nvgl_filter_mode(t->MinFilter) << 16 + | 2 << 12; tx_enable = NV20TCL_TX_ENABLE_ENABLE | log2i(t->MaxAnisotropy) << 4; diff --git a/src/mesa/drivers/dri/r200/r200_swtcl.c b/src/mesa/drivers/dri/r200/r200_swtcl.c index 262fe3cddee..dbf4ad477db 100644 --- a/src/mesa/drivers/dri/r200/r200_swtcl.c +++ b/src/mesa/drivers/dri/r200/r200_swtcl.c @@ -612,6 +612,8 @@ static void r200RasterPrimitive( GLcontext *ctx, GLuint hwprim ) { r200ContextPtr rmesa = R200_CONTEXT(ctx); + radeon_prepare_render(&rmesa->radeon); + if (rmesa->radeon.swtcl.hw_primitive != hwprim) { /* need to disable perspective-correct texturing for point sprites */ if ((hwprim & 0xf) == R200_VF_PRIM_POINT_SPRITES && ctx->Point.PointSprite) { diff --git a/src/mesa/drivers/dri/r200/r200_tcl.c b/src/mesa/drivers/dri/r200/r200_tcl.c index d43e14581e9..4ae0f304918 100644 --- a/src/mesa/drivers/dri/r200/r200_tcl.c +++ b/src/mesa/drivers/dri/r200/r200_tcl.c @@ -264,6 +264,8 @@ void r200TclPrimitive( GLcontext *ctx, r200ContextPtr rmesa = R200_CONTEXT(ctx); GLuint newprim = hw_prim | R200_VF_TCL_OUTPUT_VTX_ENABLE; + radeon_prepare_render(&rmesa->radeon); + if (newprim != rmesa->tcl.hw_primitive || !discrete_prim[hw_prim&0xf]) { /* need to disable perspective-correct texturing for point sprites */ diff --git a/src/mesa/drivers/dri/r300/compiler/r3xx_fragprog.c b/src/mesa/drivers/dri/r300/compiler/r3xx_fragprog.c index a326ee4c4fa..d2fa816894c 100644 --- a/src/mesa/drivers/dri/r300/compiler/r3xx_fragprog.c +++ b/src/mesa/drivers/dri/r300/compiler/r3xx_fragprog.c @@ -109,13 +109,13 @@ void r3xx_compile_fragment_program(struct r300_fragment_program_compiler* c) debug_program_log(c, "before compilation"); if (c->Base.is_r500){ - r500_transform_unroll_loops(&c->Base, &loop_state); - debug_program_log(c, "after r500 transform loops"); + rc_unroll_loops(&c->Base, R500_PFS_MAX_INST); + debug_program_log(c, "after unroll loops"); } else{ - rc_transform_unroll_loops(&c->Base, &loop_state); + rc_transform_loops(&c->Base, &loop_state, -1); debug_program_log(c, "after transform loops"); - + rc_emulate_branches(&c->Base); debug_program_log(c, "after emulate branches"); } diff --git a/src/mesa/drivers/dri/r300/compiler/r3xx_vertprog.c b/src/mesa/drivers/dri/r300/compiler/r3xx_vertprog.c index d347b4df9cd..666c9c2a7a9 100644 --- a/src/mesa/drivers/dri/r300/compiler/r3xx_vertprog.c +++ b/src/mesa/drivers/dri/r300/compiler/r3xx_vertprog.c @@ -32,6 +32,11 @@ #include "radeon_emulate_branches.h" #include "radeon_emulate_loops.h" +struct loop { + int BgnLoop; + +}; + /* * Take an already-setup and valid source then swizzle it appropriately to * obtain a constant ZERO or ONE source. @@ -332,11 +337,140 @@ static void ei_pow(struct r300_vertex_program_code *vp, inst[3] = t_src_scalar(vp, &vpi->SrcReg[1]); } +static void mark_write(void * userdata, struct rc_instruction * inst, + rc_register_file file, unsigned int index, unsigned int mask) +{ + unsigned int * writemasks = userdata; + + if (file != RC_FILE_TEMPORARY) + return; + + if (index >= R300_VS_MAX_TEMPS) + return; + + writemasks[index] |= mask; +} + +static unsigned long t_pred_src(struct r300_vertex_program_compiler * compiler) +{ + return PVS_SRC_OPERAND(compiler->PredicateIndex, + t_swizzle(RC_SWIZZLE_ZERO), + t_swizzle(RC_SWIZZLE_ZERO), + t_swizzle(RC_SWIZZLE_ZERO), + t_swizzle(RC_SWIZZLE_W), + t_src_class(RC_FILE_TEMPORARY), + 0); +} + +static unsigned long t_pred_dst(struct r300_vertex_program_compiler * compiler, + unsigned int hw_opcode, int is_math) +{ + return PVS_OP_DST_OPERAND(hw_opcode, + is_math, + 0, + compiler->PredicateIndex, + RC_MASK_W, + t_dst_class(RC_FILE_TEMPORARY)); + +} + +static void ei_if(struct r300_vertex_program_compiler * compiler, + struct rc_instruction *rci, + unsigned int * inst, + unsigned int branch_depth) +{ + unsigned int predicate_opcode; + int is_math = 0; + + if (!compiler->Base.is_r500) { + rc_error(&compiler->Base,"Opcode IF not supported\n"); + return; + } + + /* Reserve a temporary to use as our predicate stack counter, if we + * don't already have one. */ + if (!compiler->PredicateMask) { + unsigned int writemasks[R300_VS_MAX_TEMPS]; + memset(writemasks, 0, sizeof(writemasks)); + struct rc_instruction * inst; + unsigned int i; + for(inst = compiler->Base.Program.Instructions.Next; + inst != &compiler->Base.Program.Instructions; + inst = inst->Next) { + rc_for_all_writes_mask(inst, mark_write, writemasks); + } + for(i = 0; i < R300_VS_MAX_TEMPS; i++) { + unsigned int mask = ~writemasks[i] & RC_MASK_XYZW; + /* Only the W component can be used fo the predicate + * stack counter. */ + if (mask & RC_MASK_W) { + compiler->PredicateMask = RC_MASK_W; + compiler->PredicateIndex = i; + break; + } + } + if (i == R300_VS_MAX_TEMPS) { + rc_error(&compiler->Base, "No free temporary to use for" + " predicate stack counter.\n"); + return; + } + } + predicate_opcode = + branch_depth ? VE_PRED_SET_NEQ_PUSH : ME_PRED_SET_NEQ; + + rci->U.I.SrcReg[0].Swizzle = RC_MAKE_SWIZZLE_SMEAR(GET_SWZ(rci->U.I.SrcReg[0].Swizzle,0)); + if (branch_depth == 0) { + is_math = 1; + predicate_opcode = ME_PRED_SET_NEQ; + inst[1] = t_src(compiler->code, &rci->U.I.SrcReg[0]); + inst[2] = 0; + } else { + predicate_opcode = VE_PRED_SET_NEQ_PUSH; + inst[1] = t_pred_src(compiler); + inst[2] = t_src(compiler->code, &rci->U.I.SrcReg[0]); + } + + inst[0] = t_pred_dst(compiler, predicate_opcode, is_math); + inst[3] = 0; + +} + +static void ei_else(struct r300_vertex_program_compiler * compiler, + unsigned int * inst) +{ + if (!compiler->Base.is_r500) { + rc_error(&compiler->Base,"Opcode ELSE not supported\n"); + return; + } + inst[0] = t_pred_dst(compiler, ME_PRED_SET_INV, 1); + inst[1] = t_pred_src(compiler); + inst[2] = 0; + inst[3] = 0; +} + +static void ei_endif(struct r300_vertex_program_compiler *compiler, + unsigned int * inst) +{ + if (!compiler->Base.is_r500) { + rc_error(&compiler->Base,"Opcode ENDIF not supported\n"); + return; + } + inst[0] = t_pred_dst(compiler, ME_PRED_SET_POP, 1); + inst[1] = t_pred_src(compiler); + inst[2] = 0; + inst[3] = 0; +} static void translate_vertex_program(struct r300_vertex_program_compiler * compiler) { struct rc_instruction *rci; + struct loop * loops; + int current_loop_depth = 0; + int loops_reserved = 0; + + unsigned int branch_depth = 0; + compiler->code->pos_end = 0; /* Not supported yet */ compiler->code->length = 0; @@ -366,9 +500,12 @@ static void translate_vertex_program(struct r300_vertex_program_compiler * compi case RC_OPCODE_COS: ei_math1(compiler->code, ME_COS, vpi, inst); break; case RC_OPCODE_DP4: ei_vector2(compiler->code, VE_DOT_PRODUCT, vpi, inst); break; case RC_OPCODE_DST: ei_vector2(compiler->code, VE_DISTANCE_VECTOR, vpi, inst); break; + case RC_OPCODE_ELSE: ei_else(compiler, inst); break; + case RC_OPCODE_ENDIF: ei_endif(compiler, inst); branch_depth--; break; case RC_OPCODE_EX2: ei_math1(compiler->code, ME_EXP_BASE2_FULL_DX, vpi, inst); break; case RC_OPCODE_EXP: ei_math1(compiler->code, ME_EXP_BASE2_DX, vpi, inst); break; case RC_OPCODE_FRC: ei_vector1(compiler->code, VE_FRACTION, vpi, inst); break; + case RC_OPCODE_IF: ei_if(compiler, rci, inst, branch_depth); branch_depth++; break; case RC_OPCODE_LG2: ei_math1(compiler->code, ME_LOG_BASE2_FULL_DX, vpi, inst); break; case RC_OPCODE_LIT: ei_lit(compiler->code, vpi, inst); break; case RC_OPCODE_LOG: ei_math1(compiler->code, ME_LOG_BASE2_DX, vpi, inst); break; @@ -385,11 +522,86 @@ static void translate_vertex_program(struct r300_vertex_program_compiler * compi case RC_OPCODE_SIN: ei_math1(compiler->code, ME_SIN, vpi, inst); break; case RC_OPCODE_SLT: ei_vector2(compiler->code, VE_SET_LESS_THAN, vpi, inst); break; case RC_OPCODE_SNE: ei_vector2(compiler->code, VE_SET_NOT_EQUAL, vpi, inst); break; + case RC_OPCODE_BGNLOOP: + { + struct loop * l; + + if ((!compiler->Base.is_r500 + && loops_reserved >= R300_VS_MAX_LOOP_DEPTH) + || loops_reserved >= R500_VS_MAX_FC_DEPTH) { + rc_error(&compiler->Base, + "Loops are nested too deep."); + return; + } + memory_pool_array_reserve(&compiler->Base.Pool, + struct loop, loops, current_loop_depth, + loops_reserved, 1); + l = &loops[current_loop_depth++]; + memset(l , 0, sizeof(struct loop)); + l->BgnLoop = (compiler->code->length / 4); + continue; + } + case RC_OPCODE_ENDLOOP: + { + struct loop * l = &loops[current_loop_depth - 1]; + unsigned int act_addr = l->BgnLoop - 1; + unsigned int last_addr = (compiler->code->length / 4) - 1; + unsigned int ret_addr = l->BgnLoop; + + if (loops_reserved >= R300_VS_MAX_FC_OPS) { + rc_error(&compiler->Base, + "Too many flow control instructions."); + return; + } + if (compiler->Base.is_r500) { + compiler->code->fc_op_addrs.r500 + [compiler->code->num_fc_ops].lw = + R500_PVS_FC_ACT_ADRS(act_addr) + | R500_PVS_FC_LOOP_CNT_JMP_INST(0xffff) + ; + compiler->code->fc_op_addrs.r500 + [compiler->code->num_fc_ops].uw = + R500_PVS_FC_LAST_INST(last_addr) + | R500_PVS_FC_RTN_INST(ret_addr) + ; + } else { + compiler->code->fc_op_addrs.r300 + [compiler->code->num_fc_ops] = + R300_PVS_FC_ACT_ADRS(act_addr) + | R300_PVS_FC_LOOP_CNT_JMP_INST(0xff) + | R300_PVS_FC_LAST_INST(last_addr) + | R300_PVS_FC_RTN_INST(ret_addr) + ; + } + compiler->code->fc_loop_index[compiler->code->num_fc_ops] = + R300_PVS_FC_LOOP_INIT_VAL(0x0) + | R300_PVS_FC_LOOP_STEP_VAL(0x1) + ; + compiler->code->fc_ops |= R300_VAP_PVS_FC_OPC_LOOP( + compiler->code->num_fc_ops); + compiler->code->num_fc_ops++; + current_loop_depth--; + continue; + } + default: rc_error(&compiler->Base, "Unknown opcode %s\n", rc_get_opcode_info(vpi->Opcode)->Name); return; } + /* Non-flow control instructions that are inside an if statement + * need to pay attention to the predicate bit. */ + if (branch_depth + && vpi->Opcode != RC_OPCODE_IF + && vpi->Opcode != RC_OPCODE_ELSE + && vpi->Opcode != RC_OPCODE_ENDIF) { + + inst[0] |= (PVS_DST_PRED_ENABLE_MASK + << PVS_DST_PRED_ENABLE_SHIFT); + inst[0] |= (PVS_DST_PRED_SENSE_MASK + << PVS_DST_PRED_SENSE_SHIFT); + } + compiler->code->length += 4; if (compiler->Base.Error) @@ -406,6 +618,7 @@ struct temporary_allocation { static void allocate_temporary_registers(struct r300_vertex_program_compiler * compiler) { struct rc_instruction *inst; + struct rc_instruction *end_loop = NULL; unsigned int num_orig_temps = 0; char hwtemps[R300_VS_MAX_TEMPS]; struct temporary_allocation * ta; @@ -440,10 +653,35 @@ static void allocate_temporary_registers(struct r300_vertex_program_compiler * c /* Pass 2: Determine original temporary lifetimes */ for(inst = compiler->Base.Program.Instructions.Next; inst != &compiler->Base.Program.Instructions; inst = inst->Next) { const struct rc_opcode_info * opcode = rc_get_opcode_info(inst->U.I.Opcode); + /* Instructions inside of loops need to use the ENDLOOP + * instruction as their LastRead. */ + if (!end_loop && inst->U.I.Opcode == RC_OPCODE_BGNLOOP) { + int endloops = 1; + struct rc_instruction * ptr; + for(ptr = inst->Next; + ptr != &compiler->Base.Program.Instructions; + ptr = ptr->Next){ + if (ptr->U.I.Opcode == RC_OPCODE_BGNLOOP) { + endloops++; + } else if (ptr->U.I.Opcode == RC_OPCODE_ENDLOOP) { + endloops--; + if (endloops <= 0) { + end_loop = ptr; + break; + } + } + } + } + + if (inst == end_loop) { + end_loop = NULL; + continue; + } for (i = 0; i < opcode->NumSrcRegs; ++i) { if (inst->U.I.SrcReg[i].File == RC_FILE_TEMPORARY) - ta[inst->U.I.SrcReg[i].Index].LastRead = inst; + ta[inst->U.I.SrcReg[i].Index].LastRead = + end_loop ? end_loop : inst; } } @@ -633,30 +871,24 @@ static struct rc_swizzle_caps r300_vertprog_swizzle_caps = { void r3xx_compile_vertex_program(struct r300_vertex_program_compiler* compiler) { struct emulate_loop_state loop_state; - + compiler->Base.SwizzleCaps = &r300_vertprog_swizzle_caps; addArtificialOutputs(compiler); debug_program_log(compiler, "before compilation"); - /* XXX Ideally this should be done only for r3xx, but since - * we don't have branching support for r5xx, we use the emulation - * on all chipsets. */ - rc_transform_unroll_loops(&compiler->Base, &loop_state); - - debug_program_log(compiler, "after transform loops"); - - if (compiler->Base.is_r500){ - rc_emulate_loops(&loop_state, R500_VS_MAX_ALU); - } else { - rc_emulate_loops(&loop_state, R300_VS_MAX_ALU); - } - debug_program_log(compiler, "after emulate loops"); + if (compiler->Base.is_r500) + rc_transform_loops(&compiler->Base, &loop_state, R500_VS_MAX_ALU); + else + rc_transform_loops(&compiler->Base, &loop_state, R300_VS_MAX_ALU); - rc_emulate_branches(&compiler->Base); + debug_program_log(compiler, "after emulate loops"); - debug_program_log(compiler, "after emulate branches"); + if (!compiler->Base.is_r500) { + rc_emulate_branches(&compiler->Base); + debug_program_log(compiler, "after emulate branches"); + } if (compiler->Base.is_r500) { struct radeon_program_transformation transformations[] = { @@ -718,6 +950,6 @@ void r3xx_compile_vertex_program(struct r300_vertex_program_compiler* compiler) if (compiler->Base.Debug) { fprintf(stderr, "Final vertex program code:\n"); - r300_vertex_program_dump(compiler->code); + r300_vertex_program_dump(compiler); } } diff --git a/src/mesa/drivers/dri/r300/compiler/r3xx_vertprog_dump.c b/src/mesa/drivers/dri/r300/compiler/r3xx_vertprog_dump.c index 5800f1a78e1..e6009338e2e 100644 --- a/src/mesa/drivers/dri/r300/compiler/r3xx_vertprog_dump.c +++ b/src/mesa/drivers/dri/r300/compiler/r3xx_vertprog_dump.c @@ -20,7 +20,9 @@ * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE * USE OR OTHER DEALINGS IN THE SOFTWARE. */ +#include "radeon_compiler.h" #include "radeon_code.h" +#include "../r300_reg.h" #include <stdio.h> @@ -133,6 +135,10 @@ static void r300_vs_op_dump(uint32_t op) { fprintf(stderr, " dst: %d%s op: ", (op >> 13) & 0x7f, r300_vs_dst_debug[(op >> 8) & 0x7]); + if ((op >> PVS_DST_PRED_ENABLE_SHIFT) & 0x1) { + fprintf(stderr, "PRED %u", + (op >> PVS_DST_PRED_SENSE_SHIFT) & 0x1); + } if (op & 0x80) { if (op & 0x1) { fprintf(stderr, "PVS_MACRO_OP_2CLK_M2X_ADD\n"); @@ -160,8 +166,9 @@ static void r300_vs_src_dump(uint32_t src) r300_vs_swiz_debug[(src >> 22) & 0x7]); } -void r300_vertex_program_dump(struct r300_vertex_program_code * vs) +void r300_vertex_program_dump(struct r300_vertex_program_compiler * c) { + struct r300_vertex_program_code * vs = c->code; unsigned instrcount = vs->length / 4; unsigned i; @@ -177,4 +184,21 @@ void r300_vertex_program_dump(struct r300_vertex_program_code * vs) r300_vs_src_dump(vs->body.d[offset+1+src]); } } + + fprintf(stderr, "Flow Control Ops: 0x%08x\n",vs->fc_ops); + for(i = 0; i < vs->num_fc_ops; i++) { + switch((vs->fc_ops >> (i * 2)) & 0x3 ) { + case 0: fprintf(stderr, "NOP"); break; + case 1: fprintf(stderr, "JUMP"); break; + case 2: fprintf(stderr, "LOOP"); break; + case 3: fprintf(stderr, "JSR"); break; + } + if (c->Base.is_r500) { + fprintf(stderr,": uw-> 0x%08x lw-> 0x%08x\n", + vs->fc_op_addrs.r500[i].uw, + vs->fc_op_addrs.r500[i].lw); + } else { + fprintf(stderr,": 0x%08x\n", vs->fc_op_addrs.r300[i]); + } + } } diff --git a/src/mesa/drivers/dri/r300/compiler/r500_fragprog.c b/src/mesa/drivers/dri/r300/compiler/r500_fragprog.c index e6b5522c5b9..80a120497e3 100644 --- a/src/mesa/drivers/dri/r300/compiler/r500_fragprog.c +++ b/src/mesa/drivers/dri/r300/compiler/r500_fragprog.c @@ -30,7 +30,6 @@ #include <stdio.h> #include "../r300_reg.h" -#include "radeon_emulate_loops.h" /** * Rewrite IF instructions to use the ALU result special register. @@ -60,31 +59,6 @@ int r500_transform_IF( return 1; } -/** - * Rewrite loops to make them easier to emit. This is not a local - * transformation, because it modifies and reorders an entire block of code. - */ -void r500_transform_unroll_loops(struct radeon_compiler * c, - struct emulate_loop_state *s) -{ - int i; - - rc_transform_unroll_loops(c, s); - - for( i = s->LoopCount - 1; i >= 0; i-- ){ - struct rc_instruction * inst_continue; - if(!s->Loops[i].EndLoop){ - continue; - } - /* Insert a continue instruction at the end of the loop. This - * is required in order to emit loops correctly. */ - inst_continue = rc_insert_new_instruction(c, - s->Loops[i].EndIf->Prev); - inst_continue->U.I.Opcode = RC_OPCODE_CONTINUE; - } - -} - static int r500_swizzle_is_native(rc_opcode opcode, struct rc_src_register reg) { unsigned int relevant; diff --git a/src/mesa/drivers/dri/r300/compiler/r500_fragprog.h b/src/mesa/drivers/dri/r300/compiler/r500_fragprog.h index 0d005a794ff..34173351f83 100644 --- a/src/mesa/drivers/dri/r300/compiler/r500_fragprog.h +++ b/src/mesa/drivers/dri/r300/compiler/r500_fragprog.h @@ -49,6 +49,4 @@ extern int r500_transform_IF( struct rc_instruction * inst, void* data); -void r500_transform_unroll_loops(struct radeon_compiler * c, - struct emulate_loop_state * s); #endif diff --git a/src/mesa/drivers/dri/r300/compiler/r500_fragprog_emit.c b/src/mesa/drivers/dri/r300/compiler/r500_fragprog_emit.c index 0bd8f0a239f..9b60e30f586 100644 --- a/src/mesa/drivers/dri/r300/compiler/r500_fragprog_emit.c +++ b/src/mesa/drivers/dri/r300/compiler/r500_fragprog_emit.c @@ -64,7 +64,16 @@ struct branch_info { }; struct loop_info { - int LoopStart; + int BgnLoop; + + int BranchDepth; + int * Brks; + int BrkCount; + int BrkReserved; + + int * Conts; + int ContCount; + int ContReserved; }; struct emit_state { @@ -368,6 +377,12 @@ static void emit_flowcontrol(struct emit_state * s, struct rc_instruction * inst unsigned int newip = ++s->Code->inst_end; + /* Currently all loops use the same integer constant to intialize + * the loop variables. */ + if(!s->Code->int_constants[0]) { + s->Code->int_constants[0] = R500_FC_INT_CONST_KR(0xff); + s->Code->int_constant_count = 1; + } s->Code->inst[newip].inst0 = R500_INST_TYPE_FC | R500_INST_ALU_WAIT; switch(inst->U.I.Opcode){ @@ -378,32 +393,77 @@ static void emit_flowcontrol(struct emit_state * s, struct rc_instruction * inst s->Loops, s->CurrentLoopDepth, s->LoopsReserved, 1); loop = &s->Loops[s->CurrentLoopDepth++]; - - /* We don't emit an instruction for BGNLOOP, so we need to - * decrement the instruction counter, but first we need to - * set LoopStart to the current value of inst_end, which - * will end up being the first real instruction in the loop.*/ - loop->LoopStart = s->Code->inst_end--; + memset(loop, 0, sizeof(struct loop_info)); + loop->BranchDepth = s->CurrentBranchDepth; + loop->BgnLoop = newip; + + s->Code->inst[newip].inst2 = R500_FC_OP_LOOP + | R500_FC_JUMP_FUNC(0x00) + | R500_FC_IGNORE_UNCOVERED + ; break; - case RC_OPCODE_BRK: - /* Don't emit an instruction for BRK */ - s->Code->inst_end--; + loop = &s->Loops[s->CurrentLoopDepth - 1]; + memory_pool_array_reserve(&s->C->Pool, int, loop->Brks, + loop->BrkCount, loop->BrkReserved, 1); + + loop->Brks[loop->BrkCount++] = newip; + s->Code->inst[newip].inst2 = R500_FC_OP_BREAKLOOP + | R500_FC_JUMP_FUNC(0xff) + | R500_FC_B_OP1_DECR + | R500_FC_B_POP_CNT( + s->CurrentBranchDepth - loop->BranchDepth) + | R500_FC_IGNORE_UNCOVERED + ; break; - case RC_OPCODE_CONTINUE: + case RC_OPCODE_CONT: loop = &s->Loops[s->CurrentLoopDepth - 1]; - s->Code->inst[newip].inst2 = R500_FC_OP_JUMP | - R500_FC_JUMP_FUNC(0xff); - s->Code->inst[newip].inst3 = R500_FC_JUMP_ADDR(loop->LoopStart); + memory_pool_array_reserve(&s->C->Pool, int, loop->Conts, + loop->ContCount, loop->ContReserved, 1); + loop->Conts[loop->ContCount++] = newip; + s->Code->inst[newip].inst2 = R500_FC_OP_CONTINUE + | R500_FC_JUMP_FUNC(0xff) + | R500_FC_B_OP1_DECR + | R500_FC_B_POP_CNT( + s->CurrentBranchDepth - loop->BranchDepth) + | R500_FC_IGNORE_UNCOVERED + ; break; case RC_OPCODE_ENDLOOP: - /* Don't emit an instruction for ENDLOOP */ - s->Code->inst_end--; + { + loop = &s->Loops[s->CurrentLoopDepth - 1]; + /* Emit ENDLOOP */ + s->Code->inst[newip].inst2 = R500_FC_OP_ENDLOOP + | R500_FC_JUMP_FUNC(0xff) + | R500_FC_JUMP_ANY + | R500_FC_IGNORE_UNCOVERED + ; + /* The constant integer at index 0 is used by all loops. */ + s->Code->inst[newip].inst3 = R500_FC_INT_ADDR(0) + | R500_FC_JUMP_ADDR(loop->BgnLoop + 1) + ; + + /* Set jump address and int constant for BGNLOOP */ + s->Code->inst[loop->BgnLoop].inst3 = R500_FC_INT_ADDR(0) + | R500_FC_JUMP_ADDR(newip) + ; + + /* Set jump address for the BRK instructions. */ + while(loop->BrkCount--) { + s->Code->inst[loop->Brks[loop->BrkCount]].inst3 = + R500_FC_JUMP_ADDR(newip + 1); + } + + /* Set jump address for CONT instructions. */ + while(loop->ContCount--) { + s->Code->inst[loop->Conts[loop->ContCount]].inst3 = + R500_FC_JUMP_ADDR(newip); + } s->CurrentLoopDepth--; break; - + } case RC_OPCODE_IF: if ( s->CurrentBranchDepth >= MAX_BRANCH_DEPTH_FULL) { rc_error(s->C, "Branch depth exceeds hardware limit"); @@ -442,24 +502,16 @@ static void emit_flowcontrol(struct emit_state * s, struct rc_instruction * inst } branch = &s->Branches[s->CurrentBranchDepth - 1]; - - if(inst->Prev->U.I.Opcode == RC_OPCODE_BRK){ - branch->Endif = --s->Code->inst_end; - s->Code->inst[branch->Endif].inst2 |= - R500_FC_B_OP0_DECR; - } - else{ - branch->Endif = newip; - - s->Code->inst[branch->Endif].inst2 = R500_FC_OP_JUMP - | R500_FC_A_OP_NONE /* no address stack */ - | R500_FC_JUMP_ANY /* docs says set this, but I don't understand why */ - | R500_FC_B_OP0_DECR /* decrement branch counter if stay */ - | R500_FC_B_OP1_NONE /* no branch counter if stay */ - | R500_FC_B_POP_CNT(1) + branch->Endif = newip; + + s->Code->inst[branch->Endif].inst2 = R500_FC_OP_JUMP + | R500_FC_A_OP_NONE /* no address stack */ + | R500_FC_JUMP_ANY /* docs says set this, but I don't understand why */ + | R500_FC_B_OP0_DECR /* decrement branch counter if stay */ + | R500_FC_B_OP1_NONE /* no branch counter if stay */ + | R500_FC_B_POP_CNT(1) ; - s->Code->inst[branch->Endif].inst3 = R500_FC_JUMP_ADDR(branch->Endif + 1); - } + s->Code->inst[branch->Endif].inst3 = R500_FC_JUMP_ADDR(branch->Endif + 1); s->Code->inst[branch->If].inst2 = R500_FC_OP_JUMP | R500_FC_A_OP_NONE /* no address stack */ | R500_FC_JUMP_FUNC(0x0f) /* jump if ALU result is false */ @@ -544,11 +596,9 @@ void r500BuildFragmentProgramHwCode(struct r300_fragment_program_compiler *compi code->inst[ip].inst0 = R500_INST_TYPE_OUT | R500_INST_TEX_SEM_WAIT; } - /* Use FULL flow control mode if branches are nested deep enough. - * We don not need to enable FULL flow control mode for loops, becasue - * we aren't using the hardware loop instructions. - */ - if (s.MaxBranchDepth >= 4) { + /* Enable full flow control mode if we are using loops or have if + * statements nested at least four deep. */ + if (s.MaxBranchDepth >= 4 || s.LoopsReserved > 0) { if (code->max_temp_idx < 1) code->max_temp_idx = 1; diff --git a/src/mesa/drivers/dri/r300/compiler/radeon_code.h b/src/mesa/drivers/dri/r300/compiler/radeon_code.h index d03689763bc..896246d2035 100644 --- a/src/mesa/drivers/dri/r300/compiler/radeon_code.h +++ b/src/mesa/drivers/dri/r300/compiler/radeon_code.h @@ -221,6 +221,9 @@ struct r500_fragment_program_code { int max_temp_idx; uint32_t us_fc_ctrl; + + uint32_t int_constants[32]; + uint32_t int_constant_count; }; struct rX00_fragment_program_code { @@ -240,6 +243,12 @@ struct rX00_fragment_program_code { #define R500_VS_MAX_ALU 1024 #define R500_VS_MAX_ALU_DWORDS (R500_VS_MAX_ALU * 4) #define R300_VS_MAX_TEMPS 32 +/* This is the max for all chipsets (r300-r500) */ +#define R300_VS_MAX_FC_OPS 16 +/* The r500 maximum depth is not just for loops, but any combination of loops + * and subroutine jumps. */ +#define R500_VS_MAX_FC_DEPTH 8 +#define R300_VS_MAX_LOOP_DEPTH 1 #define VSF_MAX_INPUTS 32 #define VSF_MAX_OUTPUTS 32 @@ -260,9 +269,18 @@ struct r300_vertex_program_code { uint32_t InputsRead; uint32_t OutputsWritten; -}; -void r300_vertex_program_dump(struct r300_vertex_program_code * vs); + unsigned int num_fc_ops; + uint32_t fc_ops; + union { + uint32_t r300[R300_VS_MAX_FC_OPS]; + struct { + uint32_t lw; + uint32_t uw; + } r500[R300_VS_MAX_FC_OPS]; + } fc_op_addrs; + int32_t fc_loop_index[R300_VS_MAX_FC_OPS]; +}; #endif /* RADEON_CODE_H */ diff --git a/src/mesa/drivers/dri/r300/compiler/radeon_compiler.c b/src/mesa/drivers/dri/r300/compiler/radeon_compiler.c index 1c8ba864a41..935dc9b0a80 100644 --- a/src/mesa/drivers/dri/r300/compiler/radeon_compiler.c +++ b/src/mesa/drivers/dri/r300/compiler/radeon_compiler.c @@ -307,3 +307,46 @@ void rc_transform_fragment_wpos(struct radeon_compiler * c, unsigned wpos, unsig } } + +/** + * The FACE input in hardware contains 1 if it's a back face, 0 otherwise. + * Gallium and OpenGL define it the other way around. + * + * So let's just negate FACE at the beginning of the shader and rewrite the rest + * of the shader to read from the newly allocated temporary. + */ +void rc_transform_fragment_face(struct radeon_compiler *c, unsigned face) +{ + unsigned tempregi = rc_find_free_temporary(c); + struct rc_instruction *inst_add; + struct rc_instruction *inst; + + /* perspective divide */ + inst_add = rc_insert_new_instruction(c, &c->Program.Instructions); + inst_add->U.I.Opcode = RC_OPCODE_ADD; + + inst_add->U.I.DstReg.File = RC_FILE_TEMPORARY; + inst_add->U.I.DstReg.Index = tempregi; + inst_add->U.I.DstReg.WriteMask = RC_MASK_X; + + inst_add->U.I.SrcReg[0].File = RC_FILE_NONE; + inst_add->U.I.SrcReg[0].Swizzle = RC_SWIZZLE_1111; + + inst_add->U.I.SrcReg[1].File = RC_FILE_INPUT; + inst_add->U.I.SrcReg[1].Index = face; + inst_add->U.I.SrcReg[1].Swizzle = RC_SWIZZLE_XXXX; + inst_add->U.I.SrcReg[1].Negate = RC_MASK_XYZW; + + for (inst = inst_add->Next; inst != &c->Program.Instructions; inst = inst->Next) { + const struct rc_opcode_info * opcode = rc_get_opcode_info(inst->U.I.Opcode); + unsigned i; + + for(i = 0; i < opcode->NumSrcRegs; i++) { + if (inst->U.I.SrcReg[i].File == RC_FILE_INPUT && + inst->U.I.SrcReg[i].Index == face) { + inst->U.I.SrcReg[i].File = RC_FILE_TEMPORARY; + inst->U.I.SrcReg[i].Index = tempregi; + } + } + } +} diff --git a/src/mesa/drivers/dri/r300/compiler/radeon_compiler.h b/src/mesa/drivers/dri/r300/compiler/radeon_compiler.h index f15905d79d4..7c42eb3ae57 100644 --- a/src/mesa/drivers/dri/r300/compiler/radeon_compiler.h +++ b/src/mesa/drivers/dri/r300/compiler/radeon_compiler.h @@ -81,6 +81,7 @@ void rc_move_output(struct radeon_compiler * c, unsigned output, unsigned new_ou void rc_copy_output(struct radeon_compiler * c, unsigned output, unsigned dup_output); void rc_transform_fragment_wpos(struct radeon_compiler * c, unsigned wpos, unsigned new_input, int full_vtransform); +void rc_transform_fragment_face(struct radeon_compiler *c, unsigned face); struct r300_fragment_program_compiler { struct radeon_compiler Base; @@ -110,8 +111,12 @@ struct r300_vertex_program_compiler { void * UserData; void (*SetHwInputOutput)(struct r300_vertex_program_compiler * c); + + int PredicateIndex; + unsigned int PredicateMask; }; void r3xx_compile_vertex_program(struct r300_vertex_program_compiler* c); +void r300_vertex_program_dump(struct r300_vertex_program_compiler * c); #endif /* RADEON_COMPILER_H */ diff --git a/src/mesa/drivers/dri/r300/compiler/radeon_dataflow_deadcode.c b/src/mesa/drivers/dri/r300/compiler/radeon_dataflow_deadcode.c index fbb4235c223..faf531b412e 100644 --- a/src/mesa/drivers/dri/r300/compiler/radeon_dataflow_deadcode.c +++ b/src/mesa/drivers/dri/r300/compiler/radeon_dataflow_deadcode.c @@ -43,6 +43,12 @@ struct instruction_state { unsigned char SrcReg[3]; }; +struct loopinfo { + struct updatemask_state * Breaks; + unsigned int BreakCount; + unsigned int BreaksReserved; +}; + struct branchinfo { unsigned int HaveElse:1; @@ -59,6 +65,10 @@ struct deadcode_state { struct branchinfo * BranchStack; unsigned int BranchStackSize; unsigned int BranchStackReserved; + + struct loopinfo * LoopStack; + unsigned int LoopStackSize; + unsigned int LoopStackReserved; }; @@ -78,6 +88,22 @@ static void or_updatemasks( dst->Address = a->Address | b->Address; } +static void push_break(struct deadcode_state *s) +{ + struct loopinfo * loop = &s->LoopStack[s->LoopStackSize - 1]; + memory_pool_array_reserve(&s->C->Pool, struct updatemask_state, + loop->Breaks, loop->BreakCount, loop->BreaksReserved, 1); + + memcpy(&loop->Breaks[loop->BreakCount++], &s->R, sizeof(s->R)); +} + +static void push_loop(struct deadcode_state * s) +{ + memory_pool_array_reserve(&s->C->Pool, struct loopinfo, s->LoopStack, + s->LoopStackSize, s->LoopStackReserved, 1); + memset(&s->LoopStack[s->LoopStackSize++], 0, sizeof(struct loopinfo)); +} + static void push_branch(struct deadcode_state * s) { memory_pool_array_reserve(&s->C->Pool, struct branchinfo, s->BranchStack, @@ -233,11 +259,22 @@ void rc_dataflow_deadcode(struct radeon_compiler * c, rc_dataflow_mark_outputs_f } } } + push_loop(&s); break; } - case RC_OPCODE_CONTINUE: case RC_OPCODE_BRK: + push_break(&s); + break; case RC_OPCODE_BGNLOOP: + { + unsigned int i; + struct loopinfo * loop = &s.LoopStack[s.LoopStackSize-1]; + for(i = 0; i < loop->BreakCount; i++) { + or_updatemasks(&s.R, &s.R, &loop->Breaks[i]); + } + break; + } + case RC_OPCODE_CONT: break; case RC_OPCODE_ENDIF: push_branch(&s); diff --git a/src/mesa/drivers/dri/r300/compiler/radeon_emulate_loops.c b/src/mesa/drivers/dri/r300/compiler/radeon_emulate_loops.c index 131e9e7436d..32d4b45dd6d 100644 --- a/src/mesa/drivers/dri/r300/compiler/radeon_emulate_loops.c +++ b/src/mesa/drivers/dri/r300/compiler/radeon_emulate_loops.c @@ -39,7 +39,6 @@ #define DBG(...) do { if (VERBOSE) fprintf(stderr, __VA_ARGS__); } while(0) struct const_value { - struct radeon_compiler * C; struct rc_src_register * Src; float Value; @@ -78,17 +77,17 @@ static int src_reg_is_immediate(struct rc_src_register * src, c->Program.Constants.Constants[src->Index].Type==RC_CONSTANT_IMMEDIATE; } -static unsigned int loop_calc_iterations(struct emulate_loop_state *s, - struct loop_info * loop, unsigned int max_instructions) +static unsigned int loop_max_possible_iterations(struct radeon_compiler *c, + struct loop_info * loop, unsigned int prog_inst_limit) { - unsigned int total_i = rc_recompute_ips(s->C); + unsigned int total_i = rc_recompute_ips(c); unsigned int loop_i = (loop->EndLoop->IP - loop->BeginLoop->IP) - 1; /* +1 because the program already has one iteration of the loop. */ - return 1 + ((max_instructions - total_i) / (s->LoopCount * loop_i)); + return 1 + ((prog_inst_limit - total_i) / loop_i); } -static void loop_unroll(struct emulate_loop_state * s, - struct loop_info *loop, unsigned int iterations) +static void unroll_loop(struct radeon_compiler * c, struct loop_info * loop, + unsigned int iterations) { unsigned int i; struct rc_instruction * ptr; @@ -99,7 +98,7 @@ static void loop_unroll(struct emulate_loop_state * s, rc_remove_instruction(loop->EndLoop); for( i = 1; i < iterations; i++){ for(ptr = first; ptr != last->Next; ptr = ptr->Next){ - struct rc_instruction *new = rc_alloc_instruction(s->C); + struct rc_instruction *new = rc_alloc_instruction(c); memcpy(new, ptr, sizeof(struct rc_instruction)); rc_insert_instruction(append_to, new); append_to = new; @@ -115,7 +114,7 @@ static void update_const_value(void * data, struct rc_instruction * inst, if(value->Src->File != file || value->Src->Index != index || !(1 << GET_SWZ(value->Src->Swizzle, 0) & mask)){ - return; + return; } switch(inst->U.I.Opcode){ case RC_OPCODE_MOV: @@ -140,7 +139,7 @@ static void get_incr_amount(void * data, struct rc_instruction * inst, if(file != RC_FILE_TEMPORARY || count_inst->Index != index || (1 << GET_SWZ(count_inst->Swz,0) != mask)){ - return; + return; } /* Find the index of the counter register. */ opcode = rc_get_opcode_info(inst->U.I.Opcode); @@ -185,13 +184,16 @@ static void get_incr_amount(void * data, struct rc_instruction * inst, count_inst->Unknown = 1; return; } - } -static int transform_const_loop(struct emulate_loop_state * s, - struct loop_info * loop) +/** + * If prog_inst_limit is -1, then all eligible loops will be unrolled regardless + * of how many iterations they have. + */ +static int try_unroll_loop(struct radeon_compiler * c, struct loop_info * loop, + unsigned int prog_inst_limit) { - int end_loops = 1; + int end_loops; int iterations; struct count_inst count_inst; float limit_value; @@ -201,12 +203,12 @@ static int transform_const_loop(struct emulate_loop_state * s, struct rc_instruction * inst; /* Find the counter and the upper limit */ - - if(src_reg_is_immediate(&loop->Cond->U.I.SrcReg[0], s->C)){ + + if(src_reg_is_immediate(&loop->Cond->U.I.SrcReg[0], c)){ limit = &loop->Cond->U.I.SrcReg[0]; counter = &loop->Cond->U.I.SrcReg[1]; } - else if(src_reg_is_immediate(&loop->Cond->U.I.SrcReg[1], s->C)){ + else if(src_reg_is_immediate(&loop->Cond->U.I.SrcReg[1], c)){ limit = &loop->Cond->U.I.SrcReg[1]; counter = &loop->Cond->U.I.SrcReg[0]; } @@ -214,13 +216,13 @@ static int transform_const_loop(struct emulate_loop_state * s, DBG("No constant limit.\n"); return 0; } - + /* Find the initial value of the counter */ counter_value.Src = counter; counter_value.Value = 0.0f; counter_value.HasValue = 0; - counter_value.C = s->C; - for(inst = s->C->Program.Instructions.Next; inst != loop->BeginLoop; + counter_value.C = c; + for(inst = c->Program.Instructions.Next; inst != loop->BeginLoop; inst = inst->Next){ rc_for_all_writes_mask(inst, update_const_value, &counter_value); } @@ -230,11 +232,12 @@ static int transform_const_loop(struct emulate_loop_state * s, } DBG("Initial counter value is %f\n", counter_value.Value); /* Determine how the counter is modified each loop */ - count_inst.C = s->C; + count_inst.C = c; count_inst.Index = counter->Index; count_inst.Swz = counter->Swizzle; count_inst.Amount = 0.0f; count_inst.Unknown = 0; + end_loops = 1; for(inst = loop->BeginLoop->Next; end_loops > 0; inst = inst->Next){ switch(inst->U.I.Opcode){ /* XXX In the future we might want to try to unroll nested @@ -246,6 +249,16 @@ static int transform_const_loop(struct emulate_loop_state * s, loop->EndLoop = inst; end_loops--; break; + case RC_OPCODE_BRK: + /* Don't unroll loops if it has a BRK instruction + * other one used when testing the main conditional + * of the loop. */ + + /* Make sure we haven't entered a nested loops. */ + if(inst != loop->Brk && end_loops == 1) { + return 0; + } + break; /* XXX Check if the counter is modified within an if statement. */ case RC_OPCODE_IF: @@ -266,17 +279,20 @@ static int transform_const_loop(struct emulate_loop_state * s, /* Calculate the number of iterations of this loop. Keeping this * simple, since we only support increment and decrement loops. */ - limit_value = get_constant_value(s->C, limit, 0); + limit_value = get_constant_value(c, limit, 0); DBG("Limit is %f.\n", limit_value); + /* The iteration calculations are opposite of what you would expect. + * In a normal loop, if the condition is met, then loop continues, but + * with our loops, if the condition is met, the is exited. */ switch(loop->Cond->U.I.Opcode){ - case RC_OPCODE_SGT: - case RC_OPCODE_SLT: + case RC_OPCODE_SGE: + case RC_OPCODE_SLE: iterations = (int) ceilf((limit_value - counter_value.Value) / count_inst.Amount); break; - case RC_OPCODE_SLE: - case RC_OPCODE_SGE: + case RC_OPCODE_SGT: + case RC_OPCODE_SLT: iterations = (int) floorf((limit_value - counter_value.Value) / count_inst.Amount) + 1; break; @@ -284,77 +300,85 @@ static int transform_const_loop(struct emulate_loop_state * s, return 0; } + if (prog_inst_limit > 0 + && iterations > loop_max_possible_iterations(c, loop, + prog_inst_limit)) { + return 0; + } + DBG("Loop will have %d iterations.\n", iterations); - + /* Prepare loop for unrolling */ rc_remove_instruction(loop->Cond); rc_remove_instruction(loop->If); rc_remove_instruction(loop->Brk); rc_remove_instruction(loop->EndIf); - - loop_unroll(s, loop, iterations); + + unroll_loop(c, loop, iterations); loop->EndLoop = NULL; return 1; } -/** - * This function prepares a loop to be unrolled by converting it into an if - * statement. Here is an outline of the conversion process: - * BGNLOOP; -> BGNLOOP; - * <Additional conditional code> -> <Additional conditional code> - * SGE/SLT temp[0], temp[1], temp[2]; -> SLT/SGE temp[0], temp[1], temp[2]; - * IF temp[0]; -> IF temp[0]; - * BRK; -> - * ENDIF; -> <Loop Body> - * <Loop Body> -> ENDIF; - * ENDLOOP; -> ENDLOOP - * +/** + * @param c + * @param loop * @param inst A pointer to a BGNLOOP instruction. - * @return If the loop can be unrolled, a pointer to the first instruction of - * the unrolled loop. - * Otherwise, A pointer to the ENDLOOP instruction. - * Null if there is an error. + * @return 1 if all of the members of loop where set. + * @return 0 if there was an error and some members of loop are still NULL. */ -static struct rc_instruction * transform_loop(struct emulate_loop_state * s, +static int build_loop_info(struct radeon_compiler * c, struct loop_info * loop, struct rc_instruction * inst) { - struct loop_info *loop; struct rc_instruction * ptr; - memory_pool_array_reserve(&s->C->Pool, struct loop_info, - s->Loops, s->LoopCount, s->LoopReserved, 1); - - loop = &s->Loops[s->LoopCount++]; - memset(loop, 0, sizeof(struct loop_info)); if(inst->U.I.Opcode != RC_OPCODE_BGNLOOP){ - rc_error(s->C, "expected BGNLOOP\n", __FUNCTION__); - return NULL; + rc_error(c, "%s: expected BGNLOOP", __FUNCTION__); + return 0; } + + memset(loop, 0, sizeof(struct loop_info)); + loop->BeginLoop = inst; - for(ptr = loop->BeginLoop->Next; !loop->EndLoop; ptr = ptr->Next){ + for(ptr = loop->BeginLoop->Next; !loop->EndLoop; ptr = ptr->Next) { + + if (ptr == &c->Program.Instructions) { + rc_error(c, "%s: BGNLOOP without an ENDLOOOP.\n", + __FUNCTION__); + return 0; + } + switch(ptr->U.I.Opcode){ case RC_OPCODE_BGNLOOP: - /* Nested loop */ - ptr = transform_loop(s, ptr); - if(!ptr){ - return NULL; + { + /* Nested loop, skip ahead to the end. */ + unsigned int loop_depth = 1; + for(ptr = ptr->Next; ptr != &c->Program.Instructions; + ptr = ptr->Next){ + if (ptr->U.I.Opcode == RC_OPCODE_BGNLOOP) { + loop_depth++; + } else if (ptr->U.I.Opcode == RC_OPCODE_ENDLOOP) { + if (!--loop_depth) { + break; + } + } + } + if (ptr == &c->Program.Instructions) { + rc_error(c, "%s: BGNLOOP without an ENDLOOOP\n", + __FUNCTION__); + return 0; } break; + } case RC_OPCODE_BRK: - loop->Brk = ptr; - if(ptr->Next->U.I.Opcode != RC_OPCODE_ENDIF){ - rc_error(s->C, - "%s: expected ENDIF\n",__FUNCTION__); - return NULL; - } - loop->EndIf = ptr->Next; - if(ptr->Prev->U.I.Opcode != RC_OPCODE_IF){ - rc_error(s->C, - "%s: expected IF\n", __FUNCTION__); - return NULL; + if(ptr->Next->U.I.Opcode != RC_OPCODE_ENDIF + || ptr->Prev->U.I.Opcode != RC_OPCODE_IF + || loop->Brk){ + continue; } + loop->Brk = ptr; loop->If = ptr->Prev; + loop->EndIf = ptr->Next; switch(loop->If->Prev->U.I.Opcode){ case RC_OPCODE_SLT: case RC_OPCODE_SGE: @@ -364,18 +388,58 @@ static struct rc_instruction * transform_loop(struct emulate_loop_state * s, case RC_OPCODE_SNE: break; default: - rc_error(s->C, "%s expected conditional\n", + rc_error(c, "%s: expected conditional", __FUNCTION__); - return NULL; + return 0; } loop->Cond = loop->If->Prev; - ptr = loop->EndIf; break; + case RC_OPCODE_ENDLOOP: loop->EndLoop = ptr; break; } } + + if (loop->BeginLoop && loop->Brk && loop->If && loop->EndIf + && loop->Cond && loop->EndLoop) { + return 1; + } + return 0; +} + +/** + * This function prepares a loop to be unrolled by converting it into an if + * statement. Here is an outline of the conversion process: + * BGNLOOP; -> BGNLOOP; + * <Additional conditional code> -> <Additional conditional code> + * SGE/SLT temp[0], temp[1], temp[2]; -> SLT/SGE temp[0], temp[1], temp[2]; + * IF temp[0]; -> IF temp[0]; + * BRK; -> + * ENDIF; -> <Loop Body> + * <Loop Body> -> ENDIF; + * ENDLOOP; -> ENDLOOP + * + * @param inst A pointer to a BGNLOOP instruction. + * @return 1 for success, 0 for failure + */ +static int transform_loop(struct emulate_loop_state * s, + struct rc_instruction * inst) +{ + struct loop_info * loop; + + memory_pool_array_reserve(&s->C->Pool, struct loop_info, + s->Loops, s->LoopCount, s->LoopReserved, 1); + + loop = &s->Loops[s->LoopCount++]; + + if (!build_loop_info(s->C, loop, inst)) + return 0; + + if(try_unroll_loop(s->C, loop, s->prog_inst_limit)){ + return 1; + } + /* Reverse the conditional instruction */ switch(loop->Cond->U.I.Opcode){ case RC_OPCODE_SGE: @@ -398,43 +462,51 @@ static struct rc_instruction * transform_loop(struct emulate_loop_state * s, break; default: rc_error(s->C, "loop->Cond is not a conditional.\n"); - return NULL; - } - - /* Check if the number of loops is known at compile time. */ - if(transform_const_loop(s, loop)){ - return loop->BeginLoop->Next; + return 0; } - /* Prepare the loop to be unrolled */ + /* Prepare the loop to be emulated */ rc_remove_instruction(loop->Brk); rc_remove_instruction(loop->EndIf); rc_insert_instruction(loop->EndLoop->Prev, loop->EndIf); - return loop->EndLoop; + return 1; } -void rc_transform_unroll_loops(struct radeon_compiler *c, - struct emulate_loop_state * s) +void rc_transform_loops(struct radeon_compiler *c, + struct emulate_loop_state * s, int prog_inst_limit) { struct rc_instruction * ptr; - + memset(s, 0, sizeof(struct emulate_loop_state)); s->C = c; - ptr = s->C->Program.Instructions.Next; - while(ptr != &s->C->Program.Instructions) { + s->prog_inst_limit = prog_inst_limit; + for(ptr = s->C->Program.Instructions.Next; + ptr != &s->C->Program.Instructions; ptr = ptr->Next) { if(ptr->Type == RC_INSTRUCTION_NORMAL && ptr->U.I.Opcode == RC_OPCODE_BGNLOOP){ - ptr = transform_loop(s, ptr); - if(!ptr){ + if (!transform_loop(s, ptr)) return; + } + } +} + +void rc_unroll_loops(struct radeon_compiler *c, int prog_inst_limit) +{ + struct rc_instruction * inst; + struct loop_info loop; + + for(inst = c->Program.Instructions.Next; + inst != &c->Program.Instructions; inst = inst->Next) { + + if (inst->U.I.Opcode == RC_OPCODE_BGNLOOP) { + if (build_loop_info(c, &loop, inst)) { + try_unroll_loop(c, &loop, prog_inst_limit); } } - ptr = ptr->Next; } } -void rc_emulate_loops(struct emulate_loop_state *s, - unsigned int max_instructions) +void rc_emulate_loops(struct emulate_loop_state *s, int prog_inst_limit) { int i; /* Iterate backwards of the list of loops so that loops that nested @@ -444,8 +516,8 @@ void rc_emulate_loops(struct emulate_loop_state *s, if(!s->Loops[i].EndLoop){ continue; } - unsigned int iterations = loop_calc_iterations(s, &s->Loops[i], - max_instructions); - loop_unroll(s, &s->Loops[i], iterations); + unsigned int iterations = loop_max_possible_iterations( + s->C, &s->Loops[i], prog_inst_limit); + unroll_loop(s->C, &s->Loops[i], iterations); } } diff --git a/src/mesa/drivers/dri/r300/compiler/radeon_emulate_loops.h b/src/mesa/drivers/dri/r300/compiler/radeon_emulate_loops.h index 7748813c4eb..bba1f68e308 100644 --- a/src/mesa/drivers/dri/r300/compiler/radeon_emulate_loops.h +++ b/src/mesa/drivers/dri/r300/compiler/radeon_emulate_loops.h @@ -21,12 +21,14 @@ struct emulate_loop_state { struct loop_info * Loops; unsigned int LoopCount; unsigned int LoopReserved; + int prog_inst_limit; }; -void rc_transform_unroll_loops(struct radeon_compiler *c, - struct emulate_loop_state * s); +void rc_transform_loops(struct radeon_compiler *c, + struct emulate_loop_state * s, int prog_inst_limit); -void rc_emulate_loops(struct emulate_loop_state *s, - unsigned int max_instructions); +void rc_unroll_loops(struct radeon_compiler * c, int prog_inst_limit); + +void rc_emulate_loops(struct emulate_loop_state * s, int prog_inst_limit); #endif /* RADEON_EMULATE_LOOPS_H */ diff --git a/src/mesa/drivers/dri/r300/compiler/radeon_opcodes.c b/src/mesa/drivers/dri/r300/compiler/radeon_opcodes.c index 04f234f11d8..2ea830be7f9 100644 --- a/src/mesa/drivers/dri/r300/compiler/radeon_opcodes.c +++ b/src/mesa/drivers/dri/r300/compiler/radeon_opcodes.c @@ -386,8 +386,8 @@ struct rc_opcode_info rc_opcodes[MAX_RC_OPCODE] = { .NumSrcRegs = 0, }, { - .Opcode = RC_OPCODE_CONTINUE, - .Name = "CONTINUE", + .Opcode = RC_OPCODE_CONT, + .Name = "CONT", .IsFlowControl = 1, .NumSrcRegs = 0 }, diff --git a/src/mesa/drivers/dri/r300/compiler/radeon_opcodes.h b/src/mesa/drivers/dri/r300/compiler/radeon_opcodes.h index 8b9fa07dde2..6e18d6eb3f1 100644 --- a/src/mesa/drivers/dri/r300/compiler/radeon_opcodes.h +++ b/src/mesa/drivers/dri/r300/compiler/radeon_opcodes.h @@ -187,7 +187,7 @@ typedef enum { RC_OPCODE_ENDLOOP, - RC_OPCODE_CONTINUE, + RC_OPCODE_CONT, /** special instruction, used in R300-R500 fragment program pair instructions * indicates that the result of the alpha operation shall be replicated diff --git a/src/mesa/drivers/dri/r300/compiler/radeon_optimize.c b/src/mesa/drivers/dri/r300/compiler/radeon_optimize.c index eca06515367..7a3f35950a6 100644 --- a/src/mesa/drivers/dri/r300/compiler/radeon_optimize.c +++ b/src/mesa/drivers/dri/r300/compiler/radeon_optimize.c @@ -164,7 +164,8 @@ static void peephole(struct radeon_compiler * c, struct rc_instruction * inst_mo inst = inst->Next) { /* XXX In the future we might be able to make the optimizer * smart enough to handle loops. */ - if(inst->U.I.Opcode == RC_OPCODE_BGNLOOP){ + if(inst->U.I.Opcode == RC_OPCODE_BGNLOOP + || inst->U.I.Opcode == RC_OPCODE_ENDLOOP){ return; } rc_for_all_reads_mask(inst, peephole_scan_read, &s); diff --git a/src/mesa/drivers/dri/r300/compiler/radeon_pair_regalloc.c b/src/mesa/drivers/dri/r300/compiler/radeon_pair_regalloc.c index 8a912da4613..ce72cd97ab2 100644 --- a/src/mesa/drivers/dri/r300/compiler/radeon_pair_regalloc.c +++ b/src/mesa/drivers/dri/r300/compiler/radeon_pair_regalloc.c @@ -65,6 +65,11 @@ struct regalloc_state { struct hardware_register * HwTemporary; unsigned int NumHwTemporaries; + /** + * If an instruction is inside of a loop, end_loop will be the + * IP of the ENDLOOP instruction, otherwise end_loop will be 0 + */ + int end_loop; }; static void print_live_intervals(struct live_intervals * src) @@ -178,10 +183,10 @@ static void scan_callback(void * data, struct rc_instruction * inst, else reg->Live.Start = inst->IP; reg->Live.End = inst->IP; - } else { - if (inst->IP > reg->Live.End) - reg->Live.End = inst->IP; - } + } else if (s->end_loop) + reg->Live.End = s->end_loop; + else if (inst->IP > reg->Live.End) + reg->Live.End = inst->IP; } static void compute_live_intervals(struct regalloc_state * s) @@ -191,6 +196,31 @@ static void compute_live_intervals(struct regalloc_state * s) for(struct rc_instruction * inst = s->C->Program.Instructions.Next; inst != &s->C->Program.Instructions; inst = inst->Next) { + + /* For all instructions inside of a loop, the ENDLOOP + * instruction is used as the end of the live interval. */ + if (inst->U.I.Opcode == RC_OPCODE_BGNLOOP && !s->end_loop) { + int loops = 1; + struct rc_instruction * tmp; + for(tmp = inst->Next; + tmp != &s->C->Program.Instructions; + tmp = tmp->Next) { + if (tmp->U.I.Opcode == RC_OPCODE_BGNLOOP) { + loops++; + break; + } else if (tmp->U.I.Opcode + == RC_OPCODE_ENDLOOP) { + if(!--loops) { + s->end_loop = tmp->IP; + break; + } + } + } + } + + if (inst->IP == s->end_loop) + s->end_loop = 0; + rc_for_all_reads_mask(inst, scan_callback, s); rc_for_all_writes_mask(inst, scan_callback, s); } diff --git a/src/mesa/drivers/dri/r300/compiler/radeon_program_alu.c b/src/mesa/drivers/dri/r300/compiler/radeon_program_alu.c index 3cc28972934..857aae55145 100644 --- a/src/mesa/drivers/dri/r300/compiler/radeon_program_alu.c +++ b/src/mesa/drivers/dri/r300/compiler/radeon_program_alu.c @@ -988,17 +988,22 @@ void radeonTransformKILP(struct radeon_compiler * c) for (inst = c->Program.Instructions.Next; inst != &c->Program.Instructions; inst = inst->Next) { - if (inst->U.I.Opcode != RC_OPCODE_KILP - || inst->Prev->U.I.Opcode != RC_OPCODE_IF - || inst->Next->U.I.Opcode != RC_OPCODE_ENDIF) { + if (inst->U.I.Opcode != RC_OPCODE_KILP) continue; - } + inst->U.I.Opcode = RC_OPCODE_KIL; - inst->U.I.SrcReg[0] = negate(absolute(inst->Prev->U.I.SrcReg[0])); - /* Remove IF */ - rc_remove_instruction(inst->Prev); - /* Remove ENDIF */ - rc_remove_instruction(inst->Next); + if (inst->Prev->U.I.Opcode != RC_OPCODE_IF + || inst->Next->U.I.Opcode != RC_OPCODE_ENDIF) { + inst->U.I.SrcReg[0] = negate(builtin_one); + } else { + + inst->U.I.SrcReg[0] = + negate(absolute(inst->Prev->U.I.SrcReg[0])); + /* Remove IF */ + rc_remove_instruction(inst->Prev); + /* Remove ENDIF */ + rc_remove_instruction(inst->Next); + } } } diff --git a/src/mesa/drivers/dri/r300/r300_context.c b/src/mesa/drivers/dri/r300/r300_context.c index e4b302bbad9..3d2f8928fa6 100644 --- a/src/mesa/drivers/dri/r300/r300_context.c +++ b/src/mesa/drivers/dri/r300/r300_context.c @@ -461,7 +461,7 @@ static void r300InitGLExtensions(GLcontext *ctx) if (!r300->radeon.radeonScreen->drmSupportsOcclusionQueries) { _mesa_disable_extension(ctx, "GL_ARB_occlusion_query"); } - if (r300->radeon.radeonScreen->chip_family >= CHIP_FAMILY_RV350) + if (r300->radeon.radeonScreen->chip_family >= CHIP_FAMILY_R420) _mesa_enable_extension(ctx, "GL_ARB_half_float_vertex"); if (r300->radeon.radeonScreen->chip_family >= CHIP_FAMILY_RV515) diff --git a/src/mesa/drivers/dri/r300/r300_reg.h b/src/mesa/drivers/dri/r300/r300_reg.h index f25264b6f2d..f7705b0f6fe 100644 --- a/src/mesa/drivers/dri/r300/r300_reg.h +++ b/src/mesa/drivers/dri/r300/r300_reg.h @@ -441,6 +441,12 @@ USE OR OTHER DEALINGS IN THE SOFTWARE. #define R300_VAP_GB_HORZ_CLIP_ADJ 0x2228 #define R300_VAP_GB_HORZ_DISC_ADJ 0x222c +#define R300_VAP_PVS_FLOW_CNTL_ADDRS_0 0x2230 +#define R300_PVS_FC_ACT_ADRS(x) ((x) << 0) +#define R300_PVS_FC_LOOP_CNT_JMP_INST(x) ((x) << 8) +#define R300_PVS_FC_LAST_INST(x) ((x) << 16) +#define R300_PVS_FC_RTN_INST(x) ((x) << 24) + /* gap */ /* Sometimes, END_OF_PKT and 0x2284=0 are the only commands sent between @@ -459,6 +465,10 @@ USE OR OTHER DEALINGS IN THE SOFTWARE. # define R300_2288_R300 0x00750000 /* -- nh */ # define R300_2288_RV350 0x0000FFFF /* -- Vladimir */ +#define R300_VAP_PVS_FLOW_CNTL_LOOP_INDEX_0 0x2290 +#define R300_PVS_FC_LOOP_INIT_VAL(x) ((x) << 0) +#define R300_PVS_FC_LOOP_STEP_VAL(x) ((x) << 8) + /* gap */ /* Addresses are relative to the vertex program instruction area of the @@ -489,6 +499,9 @@ USE OR OTHER DEALINGS IN THE SOFTWARE. #define R300_VAP_PVS_CODE_CNTL_1 0x22D8 # define R300_PVS_LAST_VTX_SRC_INST_SHIFT 0 #define R300_VAP_PVS_FLOW_CNTL_OPC 0x22DC +#define R300_VAP_PVS_FC_OPC_JUMP(x) (1 << (2 * (x))) +#define R300_VAP_PVS_FC_OPC_LOOP(x) (2 << (2 * (x))) +#define R300_VAP_PVS_FC_OPC_JSR(x) (3 << (2 * (x))) /* The entire range from 0x2300 to 0x2AC inclusive seems to be used for * immediate vertices @@ -505,6 +518,14 @@ USE OR OTHER DEALINGS IN THE SOFTWARE. /* write 0 to indicate end of packet? */ #define R300_VAP_VTX_END_OF_PKT 0x24AC +#define R500_VAP_PVS_FLOW_CNTL_ADDRS_LW_0 0x2500 +#define R500_PVS_FC_ACT_ADRS(x) ((x) << 0) +#define R500_PVS_FC_LOOP_CNT_JMP_INST(x) ((x) << 16) + +#define R500_VAP_PVS_FLOW_CNTL_ADDRS_UW_0 0x2504 +#define R500_PVS_FC_LAST_INST(x) ((x) << 0) +#define R500_PVS_FC_RTN_INST(x) ((x) << 16) + /* gap */ /* These are values from r300_reg/r300_reg.h - they are known to be correct diff --git a/src/mesa/drivers/dri/r300/r300_render.c b/src/mesa/drivers/dri/r300/r300_render.c index bb8f91491f5..cf89ab7ec3d 100644 --- a/src/mesa/drivers/dri/r300/r300_render.c +++ b/src/mesa/drivers/dri/r300/r300_render.c @@ -327,6 +327,8 @@ void r300RunRenderPrimitive(GLcontext * ctx, int start, int end, int prim) BATCH_LOCALS(&rmesa->radeon); int type, num_verts; + radeon_prepare_render(&rmesa->radeon); + type = r300PrimitiveType(rmesa, prim); num_verts = r300NumVerts(rmesa, end - start, prim); diff --git a/src/mesa/drivers/dri/r300/r300_texstate.c b/src/mesa/drivers/dri/r300/r300_texstate.c index 4ba6740e3d9..94588698265 100644 --- a/src/mesa/drivers/dri/r300/r300_texstate.c +++ b/src/mesa/drivers/dri/r300/r300_texstate.c @@ -152,8 +152,8 @@ int32_t r300TranslateTexFormat(gl_format mesaFormat) case MESA_FORMAT_Z32: return R300_EASY_TX_FORMAT(X, X, X, X, X32); /* EXT_texture_sRGB */ - case MESA_FORMAT_SRGBA8: - return R300_EASY_TX_FORMAT(Y, Z, W, X, W8Z8Y8X8) | R300_TX_FORMAT_GAMMA; + case MESA_FORMAT_SARGB8: + return R300_EASY_TX_FORMAT(X, Y, Z, W, W8Z8Y8X8) | R300_TX_FORMAT_GAMMA; case MESA_FORMAT_SLA8: return R300_EASY_TX_FORMAT(X, X, X, Y, Y8X8) | R300_TX_FORMAT_GAMMA; case MESA_FORMAT_SL8: diff --git a/src/mesa/drivers/dri/r600/r600_blit.c b/src/mesa/drivers/dri/r600/r600_blit.c index 172f85eb264..27acff9c166 100644 --- a/src/mesa/drivers/dri/r600/r600_blit.c +++ b/src/mesa/drivers/dri/r600/r600_blit.c @@ -72,7 +72,7 @@ unsigned r600_check_blit(gl_format mesa_format) case MESA_FORMAT_Z24_S8: case MESA_FORMAT_Z16: case MESA_FORMAT_Z32: - case MESA_FORMAT_SRGBA8: + case MESA_FORMAT_SARGB8: case MESA_FORMAT_SLA8: case MESA_FORMAT_SL8: break; @@ -320,9 +320,9 @@ set_render_target(context_t *context, struct radeon_bo *bo, gl_format mesa_forma CLEARbit(cb_color0_info, SOURCE_FORMAT_bit); SETfield(cb_color0_info, NUMBER_UNORM, NUMBER_TYPE_shift, NUMBER_TYPE_mask); break; - case MESA_FORMAT_SRGBA8: + case MESA_FORMAT_SARGB8: format = COLOR_8_8_8_8; - comp_swap = SWAP_STD_REV; + comp_swap = SWAP_ALT; SETbit(cb_color0_info, SOURCE_FORMAT_bit); SETfield(cb_color0_info, NUMBER_SRGB, NUMBER_TYPE_shift, NUMBER_TYPE_mask); break; @@ -390,13 +390,20 @@ set_render_target(context_t *context, struct radeon_bo *bo, gl_format mesa_forma 0, RADEON_GEM_DOMAIN_VRAM | RADEON_GEM_DOMAIN_GTT, 0); END_BATCH(); - BEGIN_BATCH_NO_AUTOSTATE(12); + BEGIN_BATCH_NO_AUTOSTATE(9); R600_OUT_BATCH_REGVAL(CB_COLOR0_SIZE + (4 * id), cb_color0_size); R600_OUT_BATCH_REGVAL(CB_COLOR0_VIEW + (4 * id), cb_color0_view); - R600_OUT_BATCH_REGVAL(CB_COLOR0_INFO + (4 * id), cb_color0_info); R600_OUT_BATCH_REGVAL(CB_COLOR0_MASK + (4 * id), 0); END_BATCH(); + BEGIN_BATCH_NO_AUTOSTATE(3 + 2); + R600_OUT_BATCH_REGVAL(CB_COLOR0_INFO + (4 * id), cb_color0_info); + R600_OUT_BATCH_RELOC(0, + bo, + 0, + 0, RADEON_GEM_DOMAIN_VRAM | RADEON_GEM_DOMAIN_GTT, 0); + END_BATCH(); + COMMIT_BATCH(); } @@ -1043,17 +1050,17 @@ set_tex_resource(context_t * context, SETfield(sq_tex_resource4, SQ_SEL_X, SQ_TEX_RESOURCE_WORD4_0__DST_SEL_W_shift, SQ_TEX_RESOURCE_WORD4_0__DST_SEL_W_mask); break; - case MESA_FORMAT_SRGBA8: + case MESA_FORMAT_SARGB8: SETfield(sq_tex_resource1, FMT_8_8_8_8, SQ_TEX_RESOURCE_WORD1_0__DATA_FORMAT_shift, SQ_TEX_RESOURCE_WORD1_0__DATA_FORMAT_mask); - SETfield(sq_tex_resource4, SQ_SEL_W, - SQ_TEX_RESOURCE_WORD4_0__DST_SEL_X_shift, SQ_TEX_RESOURCE_WORD4_0__DST_SEL_X_mask); SETfield(sq_tex_resource4, SQ_SEL_Z, - SQ_TEX_RESOURCE_WORD4_0__DST_SEL_Y_shift, SQ_TEX_RESOURCE_WORD4_0__DST_SEL_Y_mask); + SQ_TEX_RESOURCE_WORD4_0__DST_SEL_X_shift, SQ_TEX_RESOURCE_WORD4_0__DST_SEL_X_mask); SETfield(sq_tex_resource4, SQ_SEL_Y, - SQ_TEX_RESOURCE_WORD4_0__DST_SEL_Z_shift, SQ_TEX_RESOURCE_WORD4_0__DST_SEL_Z_mask); + SQ_TEX_RESOURCE_WORD4_0__DST_SEL_Y_shift, SQ_TEX_RESOURCE_WORD4_0__DST_SEL_Y_mask); SETfield(sq_tex_resource4, SQ_SEL_X, + SQ_TEX_RESOURCE_WORD4_0__DST_SEL_Z_shift, SQ_TEX_RESOURCE_WORD4_0__DST_SEL_Z_mask); + SETfield(sq_tex_resource4, SQ_SEL_W, SQ_TEX_RESOURCE_WORD4_0__DST_SEL_W_shift, SQ_TEX_RESOURCE_WORD4_0__DST_SEL_W_mask); SETbit(sq_tex_resource4, SQ_TEX_RESOURCE_WORD4_0__FORCE_DEGAMMA_bit); break; @@ -1477,7 +1484,6 @@ set_default_state(context_t *context) (CLRCMP_SEL_SRC << CLRCMP_FCN_SEL_shift)); R600_OUT_BATCH_REGVAL(SQ_VTX_BASE_VTX_LOC, 0); R600_OUT_BATCH_REGVAL(SQ_VTX_START_INST_LOC, 0); - R600_OUT_BATCH_REGVAL(DB_DEPTH_INFO, 0); R600_OUT_BATCH_REGVAL(DB_DEPTH_CONTROL, 0); R600_OUT_BATCH_REGVAL(CB_SHADER_MASK, (OUTPUT0_ENABLE_mask)); R600_OUT_BATCH_REGVAL(CB_TARGET_MASK, (TARGET0_ENABLE_mask)); @@ -1526,6 +1532,7 @@ set_default_state(context_t *context) R600_OUT_BATCH(0); R600_OUT_BATCH_REGVAL(VGT_STRMOUT_BUFFER_EN, 0); + R600_OUT_BATCH_REGVAL(SX_ALPHA_TEST_CONTROL, 0); END_BATCH(); COMMIT_BATCH(); @@ -1607,7 +1614,7 @@ unsigned r600_blit(GLcontext *ctx, /* Flush is needed to make sure that source buffer has correct data */ radeonFlush(ctx); - rcommonEnsureCmdBufSpace(&context->radeon, 304, __FUNCTION__); + rcommonEnsureCmdBufSpace(&context->radeon, 308, __FUNCTION__); /* load shaders */ load_shaders(context->radeon.glCtx); @@ -1632,7 +1639,7 @@ unsigned r600_blit(GLcontext *ctx, set_tex_sampler(context); /* dst */ - /* 27 */ + /* 31 */ set_render_target(context, dst_bo, dst_mesaformat, dst_pitch, dst_width, dst_height, dst_offset); /* scissors */ diff --git a/src/mesa/drivers/dri/r600/r600_context.c b/src/mesa/drivers/dri/r600/r600_context.c index 84d9d423124..389b0412baa 100644 --- a/src/mesa/drivers/dri/r600/r600_context.c +++ b/src/mesa/drivers/dri/r600/r600_context.c @@ -72,6 +72,8 @@ WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. #define R600_ENABLE_GLSL_TEST 1 #define need_GL_VERSION_2_0 +#define need_GL_VERSION_2_1 +#define need_GL_ARB_draw_elements_base_vertex #define need_GL_ARB_occlusion_query #define need_GL_ARB_point_parameters #define need_GL_ARB_vertex_program @@ -140,6 +142,7 @@ static const struct dri_extension card_extensions[] = { {"GL_NV_vertex_program", GL_NV_vertex_program_functions}, {"GL_SGIS_generate_mipmap", NULL}, {"GL_ARB_pixel_buffer_object", NULL}, + {"GL_ARB_draw_elements_base_vertex", GL_ARB_draw_elements_base_vertex_functions }, {NULL, NULL} /* *INDENT-ON* */ }; @@ -157,6 +160,7 @@ static const struct dri_extension mm_extensions[] = { static const struct dri_extension gl_20_extension[] = { #ifdef R600_ENABLE_GLSL_TEST {"GL_ARB_shading_language_100", GL_VERSION_2_0_functions }, + {"GL_ARB_shading_language_120", GL_VERSION_2_1_functions }, #else {"GL_VERSION_2_0", GL_VERSION_2_0_functions }, #endif /* R600_ENABLE_GLSL_TEST */ diff --git a/src/mesa/drivers/dri/r600/r600_tex.c b/src/mesa/drivers/dri/r600/r600_tex.c index 41419f84601..512a52ede3e 100644 --- a/src/mesa/drivers/dri/r600/r600_tex.c +++ b/src/mesa/drivers/dri/r600/r600_tex.c @@ -431,7 +431,7 @@ unsigned r600IsFormatRenderable(gl_format mesa_format) case MESA_FORMAT_Z24_S8: case MESA_FORMAT_Z16: case MESA_FORMAT_Z32: - case MESA_FORMAT_SRGBA8: + case MESA_FORMAT_SARGB8: case MESA_FORMAT_SLA8: case MESA_FORMAT_SL8: return 1; diff --git a/src/mesa/drivers/dri/r600/r600_texstate.c b/src/mesa/drivers/dri/r600/r600_texstate.c index 1600033b9bd..ba3690b70ed 100644 --- a/src/mesa/drivers/dri/r600/r600_texstate.c +++ b/src/mesa/drivers/dri/r600/r600_texstate.c @@ -605,17 +605,17 @@ static GLboolean r600GetTexFormat(struct gl_texture_object *tObj, gl_format mesa } break; /* EXT_texture_sRGB */ - case MESA_FORMAT_SRGBA8: + case MESA_FORMAT_SARGB8: SETfield(t->SQ_TEX_RESOURCE1, FMT_8_8_8_8, SQ_TEX_RESOURCE_WORD1_0__DATA_FORMAT_shift, SQ_TEX_RESOURCE_WORD1_0__DATA_FORMAT_mask); - SETfield(t->SQ_TEX_RESOURCE4, SQ_SEL_W, - SQ_TEX_RESOURCE_WORD4_0__DST_SEL_X_shift, SQ_TEX_RESOURCE_WORD4_0__DST_SEL_X_mask); SETfield(t->SQ_TEX_RESOURCE4, SQ_SEL_Z, - SQ_TEX_RESOURCE_WORD4_0__DST_SEL_Y_shift, SQ_TEX_RESOURCE_WORD4_0__DST_SEL_Y_mask); + SQ_TEX_RESOURCE_WORD4_0__DST_SEL_X_shift, SQ_TEX_RESOURCE_WORD4_0__DST_SEL_X_mask); SETfield(t->SQ_TEX_RESOURCE4, SQ_SEL_Y, - SQ_TEX_RESOURCE_WORD4_0__DST_SEL_Z_shift, SQ_TEX_RESOURCE_WORD4_0__DST_SEL_Z_mask); + SQ_TEX_RESOURCE_WORD4_0__DST_SEL_Y_shift, SQ_TEX_RESOURCE_WORD4_0__DST_SEL_Y_mask); SETfield(t->SQ_TEX_RESOURCE4, SQ_SEL_X, + SQ_TEX_RESOURCE_WORD4_0__DST_SEL_Z_shift, SQ_TEX_RESOURCE_WORD4_0__DST_SEL_Z_mask); + SETfield(t->SQ_TEX_RESOURCE4, SQ_SEL_W, SQ_TEX_RESOURCE_WORD4_0__DST_SEL_W_shift, SQ_TEX_RESOURCE_WORD4_0__DST_SEL_W_mask); SETbit(t->SQ_TEX_RESOURCE4, SQ_TEX_RESOURCE_WORD4_0__FORCE_DEGAMMA_bit); break; diff --git a/src/mesa/drivers/dri/r600/r700_assembler.c b/src/mesa/drivers/dri/r600/r700_assembler.c index 99a33df4fcb..9c954cbf70c 100644 --- a/src/mesa/drivers/dri/r600/r700_assembler.c +++ b/src/mesa/drivers/dri/r600/r700_assembler.c @@ -275,7 +275,10 @@ GLuint GetSurfaceFormat(GLenum eType, GLuint nChannels, GLuint * pClient_size) case 2: format = FMT_8_8; break; case 3: - format = FMT_8_8_8; break; + /* for some (small/unaligned) strides using 4 comps works + * better, probably same as GL_SHORT below + * test piglit/draw-vertices */ + format = FMT_8_8_8_8; break; case 4: format = FMT_8_8_8_8; break; default: @@ -2872,25 +2875,92 @@ GLboolean assemble_CMP(r700_AssemblerBase *pAsm) GLboolean assemble_TRIG(r700_AssemblerBase *pAsm, BITS opcode) { + /* + * r600 - trunc to -PI..PI range + * r700 - normalize by dividing by 2PI + * see fdo bug 27901 + */ + int tmp; checkop1(pAsm); tmp = gethelpr(pAsm); - pAsm->D.dst.opcode = SQ_OP2_INST_MUL; + pAsm->D.dst.opcode = SQ_OP3_INST_MULADD; + pAsm->D.dst.op3 = 1; + setaddrmode_PVSDST(&(pAsm->D.dst), ADDR_ABSOLUTE); pAsm->D.dst.rtype = DST_REG_TEMPORARY; pAsm->D.dst.reg = tmp; - pAsm->D.dst.writex = 1; assemble_src(pAsm, 0, -1); pAsm->S[1].src.rtype = SRC_REC_LITERAL; setswizzle_PVSSRC(&(pAsm->S[1].src), SQ_SEL_X); + + pAsm->S[2].src.rtype = SRC_REC_LITERAL; + setswizzle_PVSSRC(&(pAsm->S[2].src), SQ_SEL_Y); + pAsm->D2.dst2.literal_slots = 1; pAsm->C[0].f = 1/(3.1415926535 * 2); - pAsm->C[1].f = 0.0F; - next_ins(pAsm); + pAsm->C[1].f = 0.5f; + + if ( GL_FALSE == next_ins(pAsm) ) + { + return GL_FALSE; + } + + pAsm->D.dst.opcode = SQ_OP2_INST_FRACT; + + setaddrmode_PVSDST(&(pAsm->D.dst), ADDR_ABSOLUTE); + pAsm->D.dst.rtype = DST_REG_TEMPORARY; + pAsm->D.dst.reg = tmp; + pAsm->D.dst.writex = 1; + + setaddrmode_PVSSRC(&(pAsm->S[0].src), ADDR_ABSOLUTE); + pAsm->S[0].src.rtype = SRC_REG_TEMPORARY; + pAsm->S[0].src.reg = tmp; + setswizzle_PVSSRC(&(pAsm->S[0].src), SQ_SEL_X); + + if(( GL_FALSE == next_ins(pAsm) )) + { + return GL_FALSE; + } + pAsm->D.dst.opcode = SQ_OP3_INST_MULADD; + pAsm->D.dst.op3 = 1; + + setaddrmode_PVSDST(&(pAsm->D.dst), ADDR_ABSOLUTE); + pAsm->D.dst.rtype = DST_REG_TEMPORARY; + pAsm->D.dst.reg = tmp; + + setaddrmode_PVSSRC(&(pAsm->S[0].src), ADDR_ABSOLUTE); + pAsm->S[0].src.rtype = SRC_REG_TEMPORARY; + pAsm->S[0].src.reg = tmp; + setswizzle_PVSSRC(&(pAsm->S[0].src), SQ_SEL_X); + + pAsm->S[1].src.rtype = SRC_REC_LITERAL; + setswizzle_PVSSRC(&(pAsm->S[1].src), SQ_SEL_X); + + pAsm->S[2].src.rtype = SRC_REC_LITERAL; + setswizzle_PVSSRC(&(pAsm->S[2].src), SQ_SEL_Y); + + pAsm->D2.dst2.literal_slots = 1; + + if (pAsm->bR6xx) + { + pAsm->C[0].f = 3.1415926535897f * 2.0f; + pAsm->C[1].f = -3.1415926535897f; + } + else + { + pAsm->C[0].f = 1.0f; + pAsm->C[1].f = -0.5f; + } + + if(( GL_FALSE == next_ins(pAsm) )) + { + return GL_FALSE; + } pAsm->D.dst.opcode = opcode; pAsm->D.dst.math = 1; @@ -4030,22 +4100,79 @@ GLboolean assemble_SCS(r700_AssemblerBase *pAsm) checkop1(pAsm); tmp = gethelpr(pAsm); - /* tmp.x = src /2*PI */ - pAsm->D.dst.opcode = SQ_OP2_INST_MUL; + + pAsm->D.dst.opcode = SQ_OP3_INST_MULADD; + pAsm->D.dst.op3 = 1; + setaddrmode_PVSDST(&(pAsm->D.dst), ADDR_ABSOLUTE); pAsm->D.dst.rtype = DST_REG_TEMPORARY; pAsm->D.dst.reg = tmp; - pAsm->D.dst.writex = 1; assemble_src(pAsm, 0, -1); pAsm->S[1].src.rtype = SRC_REC_LITERAL; setswizzle_PVSSRC(&(pAsm->S[1].src), SQ_SEL_X); + + pAsm->S[2].src.rtype = SRC_REC_LITERAL; + setswizzle_PVSSRC(&(pAsm->S[2].src), SQ_SEL_Y); + pAsm->D2.dst2.literal_slots = 1; pAsm->C[0].f = 1/(3.1415926535 * 2); - pAsm->C[1].f = 0.0F; + pAsm->C[1].f = 0.5F; - next_ins(pAsm); + if ( GL_FALSE == next_ins(pAsm) ) + { + return GL_FALSE; + } + + pAsm->D.dst.opcode = SQ_OP2_INST_FRACT; + + setaddrmode_PVSDST(&(pAsm->D.dst), ADDR_ABSOLUTE); + pAsm->D.dst.rtype = DST_REG_TEMPORARY; + pAsm->D.dst.reg = tmp; + pAsm->D.dst.writex = 1; + + setaddrmode_PVSSRC(&(pAsm->S[0].src), ADDR_ABSOLUTE); + pAsm->S[0].src.rtype = SRC_REG_TEMPORARY; + pAsm->S[0].src.reg = tmp; + setswizzle_PVSSRC(&(pAsm->S[0].src), SQ_SEL_X); + + if(( GL_FALSE == next_ins(pAsm) )) + { + return GL_FALSE; + } + pAsm->D.dst.opcode = SQ_OP3_INST_MULADD; + pAsm->D.dst.op3 = 1; + + setaddrmode_PVSDST(&(pAsm->D.dst), ADDR_ABSOLUTE); + pAsm->D.dst.rtype = DST_REG_TEMPORARY; + pAsm->D.dst.reg = tmp; + + setaddrmode_PVSSRC(&(pAsm->S[0].src), ADDR_ABSOLUTE); + pAsm->S[0].src.rtype = SRC_REG_TEMPORARY; + pAsm->S[0].src.reg = tmp; + setswizzle_PVSSRC(&(pAsm->S[0].src), SQ_SEL_X); + + pAsm->S[1].src.rtype = SRC_REC_LITERAL; + setswizzle_PVSSRC(&(pAsm->S[1].src), SQ_SEL_X); + + pAsm->S[2].src.rtype = SRC_REC_LITERAL; + setswizzle_PVSSRC(&(pAsm->S[2].src), SQ_SEL_Y); + + pAsm->D2.dst2.literal_slots = 1; + + if(pAsm->bR6xx) { + pAsm->C[0].f = 3.1415926535897f * 2.0f; + pAsm->C[1].f = -3.1415926535897f; + } else { + pAsm->C[0].f = 1.0f; + pAsm->C[1].f = -0.5f; + } + + if(( GL_FALSE == next_ins(pAsm) )) + { + return GL_FALSE; + } // COS dst.x, a.x pAsm->D.dst.opcode = SQ_OP2_INST_COS; @@ -6473,7 +6600,7 @@ GLboolean Process_Fragment_Exports(r700_AssemblerBase *pR700AsmCode, * results are undefined anyway */ if(export_count == 0) { - Process_Export(pR700AsmCode, SQ_EXPORT_PIXEL, 0, 1, 0, GL_FALSE); + Process_Export(pR700AsmCode, SQ_EXPORT_PIXEL, 0, 1, pR700AsmCode->starting_export_register_number, GL_FALSE); } if(pR700AsmCode->cf_last_export_ptr != NULL) diff --git a/src/mesa/drivers/dri/r600/r700_chip.c b/src/mesa/drivers/dri/r600/r700_chip.c index cefda3ac4ba..bf8063391a2 100644 --- a/src/mesa/drivers/dri/r600/r700_chip.c +++ b/src/mesa/drivers/dri/r600/r700_chip.c @@ -265,17 +265,6 @@ static void r700SendVTXState(GLcontext *ctx, struct radeon_state_atom *atom) if (context->radeon.tcl.aos_count == 0) return; - BEGIN_BATCH_NO_AUTOSTATE(6); - R600_OUT_BATCH(CP_PACKET3(R600_IT_SET_CTL_CONST, 1)); - R600_OUT_BATCH(mmSQ_VTX_BASE_VTX_LOC - ASIC_CTL_CONST_BASE_INDEX); - R600_OUT_BATCH(0); - - R600_OUT_BATCH(CP_PACKET3(R600_IT_SET_CTL_CONST, 1)); - R600_OUT_BATCH(mmSQ_VTX_START_INST_LOC - ASIC_CTL_CONST_BASE_INDEX); - R600_OUT_BATCH(0); - END_BATCH(); - COMMIT_BATCH(); - for(i=0; i<VERT_ATTRIB_MAX; i++) { if(vp->mesa_program->Base.InputsRead & (1 << i)) { @@ -523,9 +512,9 @@ static void r700SetRenderTarget(context_t *context, int id) CB_COLOR0_INFO__ARRAY_MODE_shift, CB_COLOR0_INFO__ARRAY_MODE_mask); CLEARbit(r700->render_target[id].CB_COLOR0_INFO.u32All, SOURCE_FORMAT_bit); break; - case MESA_FORMAT_SRGBA8: + case MESA_FORMAT_SARGB8: format = COLOR_8_8_8_8; - comp_swap = SWAP_STD_REV; + comp_swap = SWAP_ALT; number_type = NUMBER_SRGB; SETbit(r700->render_target[id].CB_COLOR0_INFO.u32All, SOURCE_FORMAT_bit); break; @@ -617,18 +606,25 @@ static void r700SendDepthTargetState(GLcontext *ctx, struct radeon_state_atom *a r700SetDepthTarget(context); - BEGIN_BATCH_NO_AUTOSTATE(8 + 2); + BEGIN_BATCH_NO_AUTOSTATE(7 + 2); R600_OUT_BATCH_REGSEQ(DB_DEPTH_SIZE, 2); R600_OUT_BATCH(r700->DB_DEPTH_SIZE.u32All); R600_OUT_BATCH(r700->DB_DEPTH_VIEW.u32All); - R600_OUT_BATCH_REGSEQ(DB_DEPTH_BASE, 2); + R600_OUT_BATCH_REGSEQ(DB_DEPTH_BASE, 1); R600_OUT_BATCH(r700->DB_DEPTH_BASE.u32All); - R600_OUT_BATCH(r700->DB_DEPTH_INFO.u32All); R600_OUT_BATCH_RELOC(r700->DB_DEPTH_BASE.u32All, rrb->bo, r700->DB_DEPTH_BASE.u32All, 0, RADEON_GEM_DOMAIN_VRAM, 0); END_BATCH(); + BEGIN_BATCH_NO_AUTOSTATE(3 + 2); + R600_OUT_BATCH_REGSEQ(DB_DEPTH_INFO, 1); + R600_OUT_BATCH(r700->DB_DEPTH_INFO.u32All); + R600_OUT_BATCH_RELOC(r700->DB_DEPTH_INFO.u32All, + rrb->bo, + r700->DB_DEPTH_INFO.u32All, + 0, RADEON_GEM_DOMAIN_VRAM, 0); + END_BATCH(); if ((context->radeon.radeonScreen->chip_family > CHIP_FAMILY_R600) && (context->radeon.radeonScreen->chip_family < CHIP_FAMILY_RV770)) { @@ -687,27 +683,35 @@ static void r700SendRenderTargetState(GLcontext *ctx, struct radeon_state_atom * BEGIN_BATCH_NO_AUTOSTATE(3 + 2); R600_OUT_BATCH_REGSEQ(CB_COLOR0_TILE + (4 * id), 1); R600_OUT_BATCH(r700->render_target[id].CB_COLOR0_TILE.u32All); - R600_OUT_BATCH_RELOC(r700->render_target[id].CB_COLOR0_BASE.u32All, + R600_OUT_BATCH_RELOC(r700->render_target[id].CB_COLOR0_TILE.u32All, rrb->bo, - r700->render_target[id].CB_COLOR0_BASE.u32All, + r700->render_target[id].CB_COLOR0_TILE.u32All, 0, RADEON_GEM_DOMAIN_VRAM, 0); END_BATCH(); BEGIN_BATCH_NO_AUTOSTATE(3 + 2); R600_OUT_BATCH_REGSEQ(CB_COLOR0_FRAG + (4 * id), 1); R600_OUT_BATCH(r700->render_target[id].CB_COLOR0_FRAG.u32All); - R600_OUT_BATCH_RELOC(r700->render_target[id].CB_COLOR0_BASE.u32All, + R600_OUT_BATCH_RELOC(r700->render_target[id].CB_COLOR0_FRAG.u32All, rrb->bo, - r700->render_target[id].CB_COLOR0_BASE.u32All, + r700->render_target[id].CB_COLOR0_FRAG.u32All, 0, RADEON_GEM_DOMAIN_VRAM, 0); END_BATCH(); - BEGIN_BATCH_NO_AUTOSTATE(12); + BEGIN_BATCH_NO_AUTOSTATE(9); R600_OUT_BATCH_REGVAL(CB_COLOR0_SIZE + (4 * id), r700->render_target[id].CB_COLOR0_SIZE.u32All); R600_OUT_BATCH_REGVAL(CB_COLOR0_VIEW + (4 * id), r700->render_target[id].CB_COLOR0_VIEW.u32All); - R600_OUT_BATCH_REGVAL(CB_COLOR0_INFO + (4 * id), r700->render_target[id].CB_COLOR0_INFO.u32All); R600_OUT_BATCH_REGVAL(CB_COLOR0_MASK + (4 * id), r700->render_target[id].CB_COLOR0_MASK.u32All); END_BATCH(); + BEGIN_BATCH_NO_AUTOSTATE(3 + 2); + R600_OUT_BATCH_REGVAL(CB_COLOR0_INFO + (4 * id), r700->render_target[id].CB_COLOR0_INFO.u32All); + R600_OUT_BATCH_RELOC(r700->render_target[id].CB_COLOR0_INFO.u32All, + rrb->bo, + r700->render_target[id].CB_COLOR0_INFO.u32All, + 0, RADEON_GEM_DOMAIN_VRAM, 0); + + END_BATCH(); + COMMIT_BATCH(); } @@ -1465,9 +1469,6 @@ static int check_vtx(GLcontext *ctx, struct radeon_state_atom *atom) context_t *context = R700_CONTEXT(ctx); int count = context->radeon.tcl.aos_count * 18; - if (count) - count += 6; - radeon_print(RADEON_STATE, RADEON_TRACE, "%s %d\n", __func__, count); return count; } @@ -1567,7 +1568,7 @@ void r600InitAtoms(context_t *context) ALLOC_STATE(sq, always, 34, r700SendSQConfig); ALLOC_STATE(db, always, 17, r700SendDBState); ALLOC_STATE(stencil, always, 4, r700SendStencilState); - ALLOC_STATE(db_target, always, 12, r700SendDepthTargetState); + ALLOC_STATE(db_target, always, 16, r700SendDepthTargetState); ALLOC_STATE(sc, always, 15, r700SendSCState); ALLOC_STATE(scissor, always, 22, r700SendScissorState); ALLOC_STATE(aa, always, 12, r700SendAAState); @@ -1578,7 +1579,7 @@ void r600InitAtoms(context_t *context) ALLOC_STATE(poly, always, 10, r700SendPolyState); ALLOC_STATE(cb, cb, 18, r700SendCBState); ALLOC_STATE(clrcmp, always, 6, r700SendCBCLRCMPState); - ALLOC_STATE(cb_target, always, 29, r700SendRenderTargetState); + ALLOC_STATE(cb_target, always, 31, r700SendRenderTargetState); ALLOC_STATE(blnd, blnd, (6 + (R700_MAX_RENDER_TARGETS * 3)), r700SendCBBlendState); ALLOC_STATE(blnd_clr, always, 6, r700SendCBBlendColorState); ALLOC_STATE(sx, always, 9, r700SendSXState); @@ -1590,7 +1591,7 @@ void r600InitAtoms(context_t *context) ALLOC_STATE(ps, always, 24, r700SendPSState); ALLOC_STATE(vs_consts, vs_consts, (2 + (R700_MAX_DX9_CONSTS * 4)), r700SendVSConsts); ALLOC_STATE(ps_consts, ps_consts, (2 + (R700_MAX_DX9_CONSTS * 4)), r700SendPSConsts); - ALLOC_STATE(vtx, vtx, (6 + (VERT_ATTRIB_MAX * 18)), r700SendVTXState); + ALLOC_STATE(vtx, vtx, (VERT_ATTRIB_MAX * 18), r700SendVTXState); ALLOC_STATE(tx, tx, (R700_TEXTURE_NUMBERUNITS * 20), r700SendTexState); ALLOC_STATE(tx_smplr, tx, (R700_TEXTURE_NUMBERUNITS * 5), r700SendTexSamplerState); ALLOC_STATE(tx_brdr_clr, tx, (R700_TEXTURE_NUMBERUNITS * 6), r700SendTexBorderColorState); diff --git a/src/mesa/drivers/dri/r600/r700_clear.c b/src/mesa/drivers/dri/r600/r700_clear.c index 09c48565b68..d1008f28b9b 100644 --- a/src/mesa/drivers/dri/r600/r700_clear.c +++ b/src/mesa/drivers/dri/r600/r700_clear.c @@ -48,6 +48,7 @@ static GLboolean r700ClearFast(context_t *context, GLbitfield mask) void r700Clear(GLcontext * ctx, GLbitfield mask) { context_t *context = R700_CONTEXT(ctx); + radeonContextPtr radeon = &context->radeon; __DRIdrawable *dPriv = radeon_get_drawable(&context->radeon); const GLuint colorMask = *((GLuint *) & ctx->Color.ColorMask[0]); GLbitfield swrast_mask = 0, tri_mask = 0; @@ -60,6 +61,8 @@ void r700Clear(GLcontext * ctx, GLbitfield mask) context->radeon.front_buffer_dirty = GL_TRUE; } + radeon_prepare_render(radeon); + if( GL_TRUE == r700ClearFast(context, mask) ) { return; diff --git a/src/mesa/drivers/dri/r600/r700_render.c b/src/mesa/drivers/dri/r600/r700_render.c index 1929b7cc129..c5771f9fd0b 100644 --- a/src/mesa/drivers/dri/r600/r700_render.c +++ b/src/mesa/drivers/dri/r600/r700_render.c @@ -244,7 +244,8 @@ static int r700NumVerts(int num_verts, int prim) return num_verts - verts_off; } -static void r700RunRenderPrimitive(GLcontext * ctx, int start, int end, int prim) +static void r700RunRenderPrimitive(GLcontext * ctx, int start, int end, + int prim, GLint basevertex) { context_t *context = R700_CONTEXT(ctx); BATCH_LOCALS(&context->radeon); @@ -282,6 +283,7 @@ static void r700RunRenderPrimitive(GLcontext * ctx, int start, int end, int prim total_emit = 3 /* VGT_PRIMITIVE_TYPE */ + 2 /* VGT_INDEX_TYPE */ + 2 /* NUM_INSTANCES */ + + 4 /* VTX_BASE_VTX_LOC + VTX_START_INST_LOC */ + 5 + 2; /* DRAW_INDEX */ BEGIN_BATCH_NO_AUTOSTATE(total_emit); @@ -294,6 +296,11 @@ static void r700RunRenderPrimitive(GLcontext * ctx, int start, int end, int prim // num instances R600_OUT_BATCH(CP_PACKET3(R600_IT_NUM_INSTANCES, 0)); R600_OUT_BATCH(1); + /* offset */ + R600_OUT_BATCH(CP_PACKET3(R600_IT_SET_CTL_CONST, 2)); + R600_OUT_BATCH(mmSQ_VTX_BASE_VTX_LOC - ASIC_CTL_CONST_BASE_INDEX); + R600_OUT_BATCH(basevertex); //VTX_BASE_VTX_LOC + R600_OUT_BATCH(0); //VTX_START_INST_LOC // draw packet R600_OUT_BATCH(CP_PACKET3(R600_IT_DRAW_INDEX, 3)); R600_OUT_BATCH(context->ind_buf.bo_offset); @@ -364,6 +371,7 @@ static void r700RunRenderPrimitiveImmediate(GLcontext * ctx, int start, int end, total_emit += 3 /* VGT_PRIMITIVE_TYPE */ + 2 /* VGT_INDEX_TYPE */ + 2 /* NUM_INSTANCES */ + + 4 /* VTX_BASE_VTX_LOC + VTX_START_INST_LOC */ + 3; /* DRAW */ BEGIN_BATCH_NO_AUTOSTATE(total_emit); @@ -376,6 +384,11 @@ static void r700RunRenderPrimitiveImmediate(GLcontext * ctx, int start, int end, // num instances R600_OUT_BATCH(CP_PACKET3(R600_IT_NUM_INSTANCES, 0)); R600_OUT_BATCH(1); + /* offset */ + R600_OUT_BATCH(CP_PACKET3(R600_IT_SET_CTL_CONST, 2)); + R600_OUT_BATCH(mmSQ_VTX_BASE_VTX_LOC - ASIC_CTL_CONST_BASE_INDEX); + R600_OUT_BATCH(0); //VTX_BASE_VTX_LOC + R600_OUT_BATCH(0); //VTX_START_INST_LOC // draw packet if(start == 0) { @@ -433,16 +446,16 @@ static GLuint r700PredictRenderSize(GLcontext* ctx, dwords = PRE_EMIT_STATE_BUFSZ; if (ib) - dwords += nr_prims * 14; + dwords += nr_prims * 18; else { for (i = 0; i < nr_prims; ++i) { if (prim[i].start == 0) - dwords += 10; + dwords += 14; else if (prim[i].count > 0xffff) - dwords += prim[i].count + 10; + dwords += prim[i].count + 14; else - dwords += ((prim[i].count + 1) / 2) + 10; + dwords += ((prim[i].count + 1) / 2) + 14; } } @@ -625,11 +638,11 @@ static void r700SetupStreams(GLcontext *ctx, const struct gl_client_array *input stride = (input[i]->StrideB == 0) ? getTypeSize(input[i]->Type) * input[i]->Size : input[i]->StrideB; - if (input[i]->Type == GL_DOUBLE || input[i]->Type == GL_UNSIGNED_INT || input[i]->Type == GL_INT || + if (input[i]->Type == GL_DOUBLE || input[i]->Type == GL_UNSIGNED_INT || input[i]->Type == GL_INT #if MESA_BIG_ENDIAN - getTypeSize(input[i]->Type) != 4 || + || getTypeSize(input[i]->Type) != 4 #endif - stride < 4) + ) { r700ConvertAttrib(ctx, count, input[i], &context->stream_desc[index]); } @@ -637,19 +650,10 @@ static void r700SetupStreams(GLcontext *ctx, const struct gl_client_array *input { if (input[i]->BufferObj->Name) { - if (stride % 4 != 0) - { - assert(((intptr_t) input[i]->Ptr) % input[i]->StrideB == 0); - r700AlignDataToDword(ctx, input[i], count, &context->stream_desc[index]); - context->stream_desc[index].is_named_bo = GL_FALSE; - } - else - { - context->stream_desc[index].stride = input[i]->StrideB; - context->stream_desc[index].bo_offset = (intptr_t) input[i]->Ptr; - context->stream_desc[index].bo = get_radeon_buffer_object(input[i]->BufferObj)->bo; - context->stream_desc[index].is_named_bo = GL_TRUE; - } + context->stream_desc[index].stride = input[i]->StrideB; + context->stream_desc[index].bo_offset = (intptr_t) input[i]->Ptr; + context->stream_desc[index].bo = get_radeon_buffer_object(input[i]->BufferObj)->bo; + context->stream_desc[index].is_named_bo = GL_TRUE; } else { @@ -932,7 +936,8 @@ static GLboolean r700TryDrawPrims(GLcontext *ctx, r700RunRenderPrimitive(ctx, prim[i].start, prim[i].start + prim[i].count, - prim[i].mode); + prim[i].mode, + prim[i].basevertex); else r700RunRenderPrimitiveImmediate(ctx, prim[i].start, @@ -977,18 +982,24 @@ static void r700DrawPrims(GLcontext *ctx, { GLboolean retval = GL_FALSE; + context_t *context = R700_CONTEXT(ctx); + radeonContextPtr radeon = &context->radeon; + radeon_prepare_render(radeon); + /* This check should get folded into just the places that * min/max index are really needed. */ - if (!index_bounds_valid) { - vbo_get_minmax_index(ctx, prim, ib, &min_index, &max_index); - } - if (min_index) { + if (!vbo_all_varyings_in_vbos(arrays)) { + if (!index_bounds_valid) + vbo_get_minmax_index(ctx, prim, ib, &min_index, &max_index); + /* do we want to rebase, minimizes the + * amount of data to upload? */ + if (min_index) { vbo_rebase_prims( ctx, arrays, prim, nr_prims, ib, min_index, max_index, r700DrawPrims ); return; + } } - /* Make an attempt at drawing */ retval = r700TryDrawPrims(ctx, arrays, prim, nr_prims, ib, min_index, max_index); diff --git a/src/mesa/drivers/dri/r600/r700_vertprog.c b/src/mesa/drivers/dri/r600/r700_vertprog.c index 137f3007ced..6a2a09eaf1a 100644 --- a/src/mesa/drivers/dri/r600/r700_vertprog.c +++ b/src/mesa/drivers/dri/r600/r700_vertprog.c @@ -461,11 +461,11 @@ static void r700TranslateAttrib(GLcontext *ctx, GLuint unLoc, int count, const s stride = (input->StrideB == 0) ? getTypeSize(input->Type) * input->Size : input->StrideB; - if (input->Type == GL_DOUBLE || input->Type == GL_UNSIGNED_INT || input->Type == GL_INT || + if (input->Type == GL_DOUBLE || input->Type == GL_UNSIGNED_INT || input->Type == GL_INT #if MESA_BIG_ENDIAN - getTypeSize(input->Type) != 4 || + || getTypeSize(input->Type) != 4 #endif - stride < 4) + ) { pStreamDesc->type = GL_FLOAT; diff --git a/src/mesa/drivers/dri/radeon/radeon_chipset.h b/src/mesa/drivers/dri/radeon/radeon_chipset.h index b7ee9a134bf..7d54fabebbc 100644 --- a/src/mesa/drivers/dri/radeon/radeon_chipset.h +++ b/src/mesa/drivers/dri/radeon/radeon_chipset.h @@ -414,9 +414,9 @@ enum { CHIP_FAMILY_R350, CHIP_FAMILY_RV350, CHIP_FAMILY_RV380, + CHIP_FAMILY_RS400, CHIP_FAMILY_R420, CHIP_FAMILY_RV410, - CHIP_FAMILY_RS400, CHIP_FAMILY_RS600, CHIP_FAMILY_RS690, CHIP_FAMILY_RS740, diff --git a/src/mesa/drivers/dri/radeon/radeon_common.c b/src/mesa/drivers/dri/radeon/radeon_common.c index 13f1f0611b8..c1a660af3d0 100644 --- a/src/mesa/drivers/dri/radeon/radeon_common.c +++ b/src/mesa/drivers/dri/radeon/radeon_common.c @@ -708,7 +708,6 @@ void radeon_draw_buffer(GLcontext *ctx, struct gl_framebuffer *fb) if (fb->_ColorDrawBufferIndexes[0] == BUFFER_FRONT_LEFT) { rrbColor = radeon_renderbuffer(fb->Attachment[BUFFER_FRONT_LEFT].Renderbuffer); radeon->front_cliprects = GL_TRUE; - radeon->front_buffer_dirty = GL_TRUE; } else { rrbColor = radeon_renderbuffer(fb->Attachment[BUFFER_BACK_LEFT].Renderbuffer); radeon->front_cliprects = GL_FALSE; @@ -1132,17 +1131,13 @@ flush_front: if (screen->dri2.loader && (screen->dri2.loader->base.version >= 2) && (screen->dri2.loader->flushFrontBuffer != NULL)) { __DRIdrawable * drawable = radeon_get_drawable(radeon); - (*screen->dri2.loader->flushFrontBuffer)(drawable, drawable->loaderPrivate); - /* Only clear the dirty bit if front-buffer rendering is no longer - * enabled. This is done so that the dirty bit can only be set in - * glDrawBuffer. Otherwise the dirty bit would have to be set at - * each of N places that do rendering. This has worse performances, - * but it is much easier to get correct. + /* We set the dirty bit in radeon_prepare_render() if we're + * front buffer rendering once we get there. */ - if (!radeon->is_front_buffer_rendering) { - radeon->front_buffer_dirty = GL_FALSE; - } + radeon->front_buffer_dirty = GL_FALSE; + + (*screen->dri2.loader->flushFrontBuffer)(drawable, drawable->loaderPrivate); } } } diff --git a/src/mesa/drivers/dri/radeon/radeon_common_context.c b/src/mesa/drivers/dri/radeon/radeon_common_context.c index 5a7d52c4d2f..92663bf66d7 100644 --- a/src/mesa/drivers/dri/radeon/radeon_common_context.c +++ b/src/mesa/drivers/dri/radeon/radeon_common_context.c @@ -493,6 +493,50 @@ radeon_bits_per_pixel(const struct radeon_renderbuffer *rb) return _mesa_get_format_bytes(rb->base.Format) * 8; } +/* + * Check if drawable has been invalidated by dri2InvalidateDrawable(). + * Update renderbuffers if so. This prevents a client from accessing + * a backbuffer that has a swap pending but not yet completed. + * + * See intel_prepare_render for equivalent code in intel driver. + * + */ +void radeon_prepare_render(radeonContextPtr radeon) +{ + __DRIcontext *driContext = radeon->dri.context; + __DRIdrawable *drawable; + __DRIscreen *screen; + + screen = driContext->driScreenPriv; + if (!screen->dri2.loader) + return; + + drawable = driContext->driDrawablePriv; + if (drawable->dri2.stamp != driContext->dri2.draw_stamp) { + if (drawable->lastStamp != drawable->dri2.stamp) + radeon_update_renderbuffers(driContext, drawable, GL_FALSE); + + /* Intel driver does the equivalent of this, no clue if it is needed: + * radeon_draw_buffer(radeon->glCtx, &(drawable->driverPrivate)->base); + */ + driContext->dri2.draw_stamp = drawable->dri2.stamp; + } + + drawable = driContext->driReadablePriv; + if (drawable->dri2.stamp != driContext->dri2.read_stamp) { + if (drawable->lastStamp != drawable->dri2.stamp) + radeon_update_renderbuffers(driContext, drawable, GL_FALSE); + driContext->dri2.read_stamp = drawable->dri2.stamp; + } + + /* If we're currently rendering to the front buffer, the rendering + * that will happen next will probably dirty the front buffer. So + * mark it as dirty here. + */ + if (radeon->is_front_buffer_rendering) + radeon->front_buffer_dirty = GL_TRUE; +} + void radeon_update_renderbuffers(__DRIcontext *context, __DRIdrawable *drawable, GLboolean front_only) @@ -514,6 +558,11 @@ radeon_update_renderbuffers(__DRIcontext *context, __DRIdrawable *drawable, screen = context->driScreenPriv; radeon = (radeonContextPtr) context->driverPrivate; + /* Set this up front, so that in case our buffers get invalidated + * while we're getting new buffers, we don't clobber the stamp and + * thus ignore the invalidate. */ + drawable->lastStamp = drawable->dri2.stamp; + if (screen->dri2.loader && (screen->dri2.loader->base.version > 2) && (screen->dri2.loader->getBuffersWithFormat != NULL)) { @@ -650,6 +699,13 @@ radeon_update_renderbuffers(__DRIcontext *context, __DRIdrawable *drawable, rb->base.Height = drawable->h; rb->has_surface = 0; + /* r6xx+ tiling */ + rb->tile_config = radeon->radeonScreen->tile_config; + rb->group_bytes = radeon->radeonScreen->group_bytes; + rb->num_channels = radeon->radeonScreen->num_channels; + rb->num_banks = radeon->radeonScreen->num_banks; + rb->r7xx_bank_op = radeon->radeonScreen->r7xx_bank_op; + if (buffers[i].attachment == __DRI_BUFFER_STENCIL && depth_bo) { if (RADEON_DEBUG & RADEON_DRI) fprintf(stderr, "(reusing depth buffer as stencil)\n"); @@ -678,7 +734,7 @@ radeon_update_renderbuffers(__DRIcontext *context, __DRIdrawable *drawable, bo->flags |= RADEON_BO_FLAGS_MACRO_TILE; if (tiling_flags & RADEON_TILING_MICRO) bo->flags |= RADEON_BO_FLAGS_MICRO_TILE; - + } if (buffers[i].attachment == __DRI_BUFFER_DEPTH) { diff --git a/src/mesa/drivers/dri/radeon/radeon_common_context.h b/src/mesa/drivers/dri/radeon/radeon_common_context.h index 5156c5d0d0a..f06e5fdf244 100644 --- a/src/mesa/drivers/dri/radeon/radeon_common_context.h +++ b/src/mesa/drivers/dri/radeon/radeon_common_context.h @@ -93,6 +93,13 @@ struct radeon_renderbuffer GLuint pf_pending; /**< sequence number of pending flip */ GLuint vbl_pending; /**< vblank sequence number of pending flip */ __DRIdrawable *dPriv; + + /* r6xx+ tiling */ + GLuint tile_config; + GLint group_bytes; + GLint num_channels; + GLint num_banks; + GLint r7xx_bank_op; }; struct radeon_framebuffer @@ -614,5 +621,6 @@ GLboolean radeonMakeCurrent(__DRIcontext * driContextPriv, __DRIdrawable * driDrawPriv, __DRIdrawable * driReadPriv); extern void radeonDestroyContext(__DRIcontext * driContextPriv); +void radeon_prepare_render(radeonContextPtr radeon); #endif diff --git a/src/mesa/drivers/dri/radeon/radeon_mipmap_tree.c b/src/mesa/drivers/dri/radeon/radeon_mipmap_tree.c index c877e6c1765..c6e5f110ea3 100644 --- a/src/mesa/drivers/dri/radeon/radeon_mipmap_tree.c +++ b/src/mesa/drivers/dri/radeon/radeon_mipmap_tree.c @@ -133,7 +133,7 @@ static void compute_tex_image_offset(radeonContextPtr rmesa, radeon_mipmap_tree height = _mesa_next_pow_two_32(lvl->height); lvl->rowstride = get_texture_image_row_stride(rmesa, mt->mesaFormat, lvl->width, mt->tilebits); - lvl->size = get_texture_image_size(mt->mesaFormat, lvl->rowstride, lvl->height, lvl->depth, mt->tilebits); + lvl->size = get_texture_image_size(mt->mesaFormat, lvl->rowstride, height, lvl->depth, mt->tilebits); assert(lvl->size > 0); diff --git a/src/mesa/drivers/dri/radeon/radeon_pixel_read.c b/src/mesa/drivers/dri/radeon/radeon_pixel_read.c index dadb8002c7d..fb741173ca8 100644 --- a/src/mesa/drivers/dri/radeon/radeon_pixel_read.c +++ b/src/mesa/drivers/dri/radeon/radeon_pixel_read.c @@ -179,6 +179,9 @@ radeonReadPixels(GLcontext * ctx, GLenum format, GLenum type, const struct gl_pixelstore_attrib *pack, GLvoid * pixels) { + radeonContextPtr radeon = RADEON_CONTEXT(ctx); + radeon_prepare_render(radeon); + if (do_blit_readpixels(ctx, x, y, width, height, format, type, pack, pixels)) return; diff --git a/src/mesa/drivers/dri/radeon/radeon_screen.c b/src/mesa/drivers/dri/radeon/radeon_screen.c index 82107cc6aeb..fa97a19302c 100644 --- a/src/mesa/drivers/dri/radeon/radeon_screen.c +++ b/src/mesa/drivers/dri/radeon/radeon_screen.c @@ -213,6 +213,10 @@ static const GLuint __driNConfigOptions = 17; static int getSwapInfo( __DRIdrawable *dPriv, __DRIswapInfo * sInfo ); +#ifndef RADEON_INFO_TILE_CONFIG +#define RADEON_INFO_TILE_CONFIG 0x6 +#endif + static int radeonGetParam(__DRIscreen *sPriv, int param, void *value) { @@ -232,6 +236,9 @@ radeonGetParam(__DRIscreen *sPriv, int param, void *value) case RADEON_PARAM_NUM_Z_PIPES: info.request = RADEON_INFO_NUM_Z_PIPES; break; + case RADEON_INFO_TILE_CONFIG: + info.request = RADEON_INFO_TILE_CONFIG; + break; default: return -EINVAL; } @@ -376,6 +383,21 @@ static const __DRItexBufferExtension r600TexBufferExtension = { }; #endif +static void +radeonDRI2Flush(__DRIdrawable *drawable) +{ + radeonContextPtr rmesa; + + rmesa = (radeonContextPtr) drawable->driContextPriv->driverPrivate; + radeonFlush(rmesa->glCtx); +} + +static const struct __DRI2flushExtensionRec radeonFlushExtension = { + { __DRI2_FLUSH, __DRI2_FLUSH_VERSION }, + radeonDRI2Flush, + dri2InvalidateDrawable, +}; + static int radeon_set_screen_flags(radeonScreenPtr screen, int device_id) { screen->device_id = device_id; @@ -1305,6 +1327,56 @@ radeonCreateScreen2(__DRIscreen *sPriv) else screen->chip_flags |= RADEON_CLASS_R600; + /* r6xx+ tiling */ + if (IS_R600_CLASS(screen) && (sPriv->drm_version.minor >= 6)) { + ret = radeonGetParam(sPriv, RADEON_INFO_TILE_CONFIG, &temp); + if (ret) + fprintf(stderr, "failed to get tiling info\n"); + else { + screen->tile_config = temp; + screen->r7xx_bank_op = 0; + switch((screen->tile_config & 0xe) >> 1) { + case 0: + screen->num_channels = 1; + break; + case 1: + screen->num_channels = 2; + break; + case 2: + screen->num_channels = 4; + break; + case 3: + screen->num_channels = 8; + break; + default: + fprintf(stderr, "bad channels\n"); + break; + } + switch((screen->tile_config & 0x30) >> 4) { + case 0: + screen->num_banks = 4; + break; + case 1: + screen->num_banks = 8; + break; + default: + fprintf(stderr, "bad banks\n"); + break; + } + switch((screen->tile_config & 0xc0) >> 6) { + case 0: + screen->group_bytes = 256; + break; + case 1: + screen->group_bytes = 512; + break; + default: + fprintf(stderr, "bad group_bytes\n"); + break; + } + } + } + if (IS_R300_CLASS(screen)) { ret = radeonGetParam(sPriv, RADEON_PARAM_NUM_GB_PIPES, &temp); if (ret) { @@ -1379,6 +1451,8 @@ radeonCreateScreen2(__DRIscreen *sPriv) screen->extensions[i++] = &r600TexBufferExtension.base; #endif + screen->extensions[i++] = &radeonFlushExtension.base; + screen->extensions[i++] = NULL; sPriv->extensions = screen->extensions; diff --git a/src/mesa/drivers/dri/radeon/radeon_screen.h b/src/mesa/drivers/dri/radeon/radeon_screen.h index 0d7e335fa3a..2b33201a538 100644 --- a/src/mesa/drivers/dri/radeon/radeon_screen.h +++ b/src/mesa/drivers/dri/radeon/radeon_screen.h @@ -112,6 +112,13 @@ typedef struct radeon_screen { int kernel_mm; drm_radeon_sarea_t *sarea; /* Private SAREA data */ struct radeon_bo_manager *bom; + + /* r6xx+ tiling */ + GLuint tile_config; + GLint group_bytes; + GLint num_channels; + GLint num_banks; + GLint r7xx_bank_op; } radeonScreenRec, *radeonScreenPtr; #define IS_R100_CLASS(screen) \ diff --git a/src/mesa/drivers/dri/radeon/radeon_span.c b/src/mesa/drivers/dri/radeon/radeon_span.c index 1adb6096033..9dfe2dd2433 100644 --- a/src/mesa/drivers/dri/radeon/radeon_span.c +++ b/src/mesa/drivers/dri/radeon/radeon_span.c @@ -111,7 +111,6 @@ static GLubyte *r200_depth_4byte(const struct radeon_renderbuffer * rrb, * two main types: * - 1D (akin to macro-linear/micro-tiled on older asics) * - 2D (akin to macro-tiled/micro-tiled on older asics) - * only 1D tiling is implemented below */ #if defined(RADEON_R600) static inline GLint r600_1d_tile_helper(const struct radeon_renderbuffer * rrb, @@ -208,12 +207,190 @@ static inline GLint r600_1d_tile_helper(const struct radeon_renderbuffer * rrb, return offset; } +static inline GLint r600_log2(GLint n) +{ + GLint log2 = 0; + + while (n >>= 1) + ++log2; + return log2; +} + +static inline GLint r600_2d_tile_helper(const struct radeon_renderbuffer * rrb, + GLint x, GLint y, GLint is_depth, GLint is_stencil) +{ + GLint group_bytes = rrb->group_bytes; + GLint num_channels = rrb->num_channels; + GLint num_banks = rrb->num_banks; + GLint r7xx_bank_op = rrb->r7xx_bank_op; + /* */ + GLint group_bits = r600_log2(group_bytes); + GLint channel_bits = r600_log2(num_channels); + GLint bank_bits = r600_log2(num_banks); + GLint element_bytes = rrb->cpp; + GLint num_samples = 1; + GLint tile_width = 8; + GLint tile_height = 8; + GLint tile_thickness = 1; + GLint macro_tile_width = num_banks; + GLint macro_tile_height = num_channels; + GLint pitch_elements = (rrb->pitch / element_bytes) / tile_width; + GLint height = rrb->base.Height / tile_height; + GLint z = 0; + GLint sample_number = 0; + /* */ + GLint tile_bytes; + GLint macro_tile_bytes; + GLint macro_tiles_per_row; + GLint macro_tiles_per_slice; + GLint slice_offset; + GLint macro_tile_row_index; + GLint macro_tile_column_index; + GLint macro_tile_offset; + GLint pixel_number = 0; + GLint element_offset; + GLint bank = 0; + GLint channel = 0; + GLint total_offset; + GLint group_mask = (1 << group_bits) - 1; + GLint offset_low; + GLint offset_high; + GLint offset = 0; + + switch (num_channels) { + case 2: + default: + // channel[0] = x[3] ^ y[3] + channel |= (((x >> 3) ^ (y >> 3)) & 1) << 0; + break; + case 4: + // channel[0] = x[4] ^ y[3] + channel |= (((x >> 4) ^ (y >> 3)) & 1) << 0; + // channel[1] = x[3] ^ y[4] + channel |= (((x >> 3) ^ (y >> 4)) & 1) << 1; + break; + case 8: + // channel[0] = x[5] ^ y[3] + channel |= (((x >> 5) ^ (y >> 3)) & 1) << 0; + // channel[0] = x[4] ^ x[5] ^ y[4] + channel |= (((x >> 4) ^ (x >> 5) ^ (y >> 4)) & 1) << 1; + // channel[0] = x[3] ^ y[5] + channel |= (((x >> 3) ^ (y >> 5)) & 1) << 2; + break; + } + + switch (num_banks) { + case 4: + // bank[0] = x[3] ^ y[4 + log2(num_channels)] + bank |= (((x >> 3) ^ (y >> (4 + channel_bits))) & 1) << 0; + if (r7xx_bank_op) + // bank[1] = x[3] ^ y[4 + log2(num_channels)] ^ x[5] + bank |= (((x >> 4) ^ (y >> (3 + channel_bits)) ^ (x >> 5)) & 1) << 1; + else + // bank[1] = x[4] ^ y[3 + log2(num_channels)] + bank |= (((x >> 4) ^ (y >> (3 + channel_bits))) & 1) << 1; + break; + case 8: + // bank[0] = x[3] ^ y[5 + log2(num_channels)] + bank |= (((x >> 3) ^ (y >> (5 + channel_bits))) & 1) << 0; + // bank[1] = x[4] ^ y[4 + log2(num_channels)] ^ y[5 + log2(num_channels)] + bank |= (((x >> 4) ^ (y >> (4 + channel_bits)) ^ (y >> (5 + channel_bits))) & 1) << 1; + if (r7xx_bank_op) + // bank[2] = x[5] ^ y[3 + log2(num_channels)] ^ x[6] + bank |= (((x >> 5) ^ (y >> (3 + channel_bits)) ^ (x >> 6)) & 1) << 2; + else + // bank[2] = x[5] ^ y[3 + log2(num_channels)] + bank |= (((x >> 5) ^ (y >> (3 + channel_bits))) & 1) << 2; + break; + } + + tile_bytes = tile_width * tile_height * tile_thickness * element_bytes * num_samples; + macro_tile_bytes = macro_tile_width * macro_tile_height * tile_bytes; + macro_tiles_per_row = pitch_elements / macro_tile_width; + macro_tiles_per_slice = macro_tiles_per_row * (height / macro_tile_height); + slice_offset = (z / tile_thickness) * macro_tiles_per_slice * macro_tile_bytes; + macro_tile_row_index = (y / tile_height) / macro_tile_height; + macro_tile_column_index = (x / tile_width) / macro_tile_width; + macro_tile_offset = ((macro_tile_row_index * macro_tiles_per_row) + macro_tile_column_index) * macro_tile_bytes; + + if (is_depth) { + GLint pixel_offset = 0; + + pixel_number |= ((x >> 0) & 1) << 0; // pn[0] = x[0] + pixel_number |= ((y >> 0) & 1) << 1; // pn[1] = y[0] + pixel_number |= ((x >> 1) & 1) << 2; // pn[2] = x[1] + pixel_number |= ((y >> 1) & 1) << 3; // pn[3] = y[1] + pixel_number |= ((x >> 2) & 1) << 4; // pn[4] = x[2] + pixel_number |= ((y >> 2) & 1) << 5; // pn[5] = y[2] + switch (element_bytes) { + case 2: + pixel_offset = pixel_number * element_bytes * num_samples; + break; + case 4: + /* stencil and depth data are stored separately within a tile. + * stencil is stored in a contiguous tile before the depth tile. + * stencil element is 1 byte, depth element is 3 bytes. + * stencil tile is 64 bytes. + */ + if (is_stencil) + pixel_offset = pixel_number * 1 * num_samples; + else + pixel_offset = (pixel_number * 3 * num_samples) + 64; + break; + } + element_offset = pixel_offset + (sample_number * element_bytes); + } else { + GLint sample_offset; + + switch (element_bytes) { + case 1: + pixel_number |= ((x >> 0) & 1) << 0; // pn[0] = x[0] + pixel_number |= ((x >> 1) & 1) << 1; // pn[1] = x[1] + pixel_number |= ((x >> 2) & 1) << 2; // pn[2] = x[2] + pixel_number |= ((y >> 1) & 1) << 3; // pn[3] = y[1] + pixel_number |= ((y >> 0) & 1) << 4; // pn[4] = y[0] + pixel_number |= ((y >> 2) & 1) << 5; // pn[5] = y[2] + break; + case 2: + pixel_number |= ((x >> 0) & 1) << 0; // pn[0] = x[0] + pixel_number |= ((x >> 1) & 1) << 1; // pn[1] = x[1] + pixel_number |= ((x >> 2) & 1) << 2; // pn[2] = x[2] + pixel_number |= ((y >> 0) & 1) << 3; // pn[3] = y[0] + pixel_number |= ((y >> 1) & 1) << 4; // pn[4] = y[1] + pixel_number |= ((y >> 2) & 1) << 5; // pn[5] = y[2] + break; + case 4: + pixel_number |= ((x >> 0) & 1) << 0; // pn[0] = x[0] + pixel_number |= ((x >> 1) & 1) << 1; // pn[1] = x[1] + pixel_number |= ((y >> 0) & 1) << 2; // pn[2] = y[0] + pixel_number |= ((x >> 2) & 1) << 3; // pn[3] = x[2] + pixel_number |= ((y >> 1) & 1) << 4; // pn[4] = y[1] + pixel_number |= ((y >> 2) & 1) << 5; // pn[5] = y[2] + break; + } + sample_offset = sample_number * (tile_bytes / num_samples); + element_offset = sample_offset + (pixel_number * element_bytes); + } + total_offset = (slice_offset + macro_tile_offset) >> (channel_bits + bank_bits); + total_offset += element_offset; + + offset_low = total_offset & group_mask; + offset_high = (total_offset & ~group_mask) << (channel_bits + bank_bits); + offset = (bank << (group_bits + channel_bits)) + (channel << group_bits) + offset_low + offset_high; + + return offset; +} + /* depth buffers */ static GLubyte *r600_ptr_depth(const struct radeon_renderbuffer * rrb, GLint x, GLint y) { GLubyte *ptr = rrb->bo->ptr; - GLint offset = r600_1d_tile_helper(rrb, x, y, 1, 0); + GLint offset; + if (rrb->bo->flags & RADEON_BO_FLAGS_MACRO_TILE) + offset = r600_2d_tile_helper(rrb, x, y, 1, 0); + else + offset = r600_1d_tile_helper(rrb, x, y, 1, 0); return &ptr[offset]; } @@ -221,7 +398,11 @@ static GLubyte *r600_ptr_stencil(const struct radeon_renderbuffer * rrb, GLint x, GLint y) { GLubyte *ptr = rrb->bo->ptr; - GLint offset = r600_1d_tile_helper(rrb, x, y, 1, 1); + GLint offset; + if (rrb->bo->flags & RADEON_BO_FLAGS_MACRO_TILE) + offset = r600_2d_tile_helper(rrb, x, y, 1, 1); + else + offset = r600_1d_tile_helper(rrb, x, y, 1, 1); return &ptr[offset]; } @@ -235,7 +416,10 @@ static GLubyte *r600_ptr_color(const struct radeon_renderbuffer * rrb, if (rrb->has_surface || !(rrb->bo->flags & mask)) { offset = x * rrb->cpp + y * rrb->pitch; } else { - offset = r600_1d_tile_helper(rrb, x, y, 0, 0); + if (rrb->bo->flags & RADEON_BO_FLAGS_MACRO_TILE) + offset = r600_2d_tile_helper(rrb, x, y, 0, 0); + else + offset = r600_1d_tile_helper(rrb, x, y, 0, 0); } return &ptr[offset]; } diff --git a/src/mesa/drivers/dri/radeon/radeon_swtcl.c b/src/mesa/drivers/dri/radeon/radeon_swtcl.c index f2fcb46688a..29defe73a70 100644 --- a/src/mesa/drivers/dri/radeon/radeon_swtcl.c +++ b/src/mesa/drivers/dri/radeon/radeon_swtcl.c @@ -40,7 +40,10 @@ WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. #include "main/macros.h" #include "main/simple_list.h" +#include "math/m_xform.h" + #include "swrast_setup/swrast_setup.h" + #include "tnl/tnl.h" #include "tnl/t_context.h" #include "tnl/t_pipeline.h" @@ -408,6 +411,8 @@ static GLboolean radeon_run_render( GLcontext *ctx, !radeon_dma_validate_render( ctx, VB )) return GL_TRUE; + radeon_prepare_render(&rmesa->radeon); + tnl->Driver.Render.Start( ctx ); for (i = 0 ; i < VB->PrimitiveCount ; i++) diff --git a/src/mesa/drivers/dri/radeon/radeon_tcl.c b/src/mesa/drivers/dri/radeon/radeon_tcl.c index ea796e1a45f..5e1718f9dfc 100644 --- a/src/mesa/drivers/dri/radeon/radeon_tcl.c +++ b/src/mesa/drivers/dri/radeon/radeon_tcl.c @@ -252,6 +252,8 @@ void radeonTclPrimitive( GLcontext *ctx, GLuint se_cntl; GLuint newprim = hw_prim | RADEON_CP_VC_CNTL_TCL_ENABLE; + radeon_prepare_render(&rmesa->radeon); + if (newprim != rmesa->tcl.hw_primitive || !discrete_prim[hw_prim&0xf]) { RADEON_NEWPRIM( rmesa ); diff --git a/src/mesa/drivers/dri/radeon/radeon_tex_copy.c b/src/mesa/drivers/dri/radeon/radeon_tex_copy.c index 29fd31ac23f..4cb0bb60c85 100644 --- a/src/mesa/drivers/dri/radeon/radeon_tex_copy.c +++ b/src/mesa/drivers/dri/radeon/radeon_tex_copy.c @@ -153,6 +153,9 @@ radeonCopyTexImage2D(GLcontext *ctx, GLenum target, GLint level, _mesa_select_tex_image(ctx, texObj, target, level); int srcx, srcy, dstx, dsty; + radeonContextPtr radeon = RADEON_CONTEXT(ctx); + radeon_prepare_render(radeon); + if (border) goto fail; @@ -202,6 +205,9 @@ radeonCopyTexSubImage2D(GLcontext *ctx, GLenum target, GLint level, struct gl_texture_object *texObj = _mesa_select_tex_object(ctx, texUnit, target); struct gl_texture_image *texImage = _mesa_select_tex_image(ctx, texObj, target, level); + radeonContextPtr radeon = RADEON_CONTEXT(ctx); + radeon_prepare_render(radeon); + if (!do_copy_texsubimage(ctx, target, level, radeon_tex_obj(texObj), (radeon_texture_image *)texImage, xoffset, yoffset, x, y, width, height)) { diff --git a/src/mesa/drivers/dri/radeon/radeon_texture.c b/src/mesa/drivers/dri/radeon/radeon_texture.c index d2b190e42e0..8c6a50d2f0d 100644 --- a/src/mesa/drivers/dri/radeon/radeon_texture.c +++ b/src/mesa/drivers/dri/radeon/radeon_texture.c @@ -551,7 +551,7 @@ gl_format radeonChooseTextureFormat(GLcontext * ctx, case GL_SRGB8_ALPHA8: case GL_COMPRESSED_SRGB: case GL_COMPRESSED_SRGB_ALPHA: - return MESA_FORMAT_SRGBA8; + return MESA_FORMAT_SARGB8; case GL_SLUMINANCE: case GL_SLUMINANCE8: diff --git a/src/mesa/drivers/dri/savage/savagerender.c b/src/mesa/drivers/dri/savage/savagerender.c index c369bb124c2..2d9e80e29c4 100644 --- a/src/mesa/drivers/dri/savage/savagerender.c +++ b/src/mesa/drivers/dri/savage/savagerender.c @@ -33,6 +33,8 @@ #include "main/imports.h" #include "main/mtypes.h" +#include "math/m_xform.h" + #include "tnl/t_context.h" #include "savagecontext.h" diff --git a/src/mesa/drivers/dri/unichrome/via_render.c b/src/mesa/drivers/dri/unichrome/via_render.c index 896c43db1b0..4351f119555 100644 --- a/src/mesa/drivers/dri/unichrome/via_render.c +++ b/src/mesa/drivers/dri/unichrome/via_render.c @@ -33,6 +33,8 @@ #include "main/macros.h" #include "main/mtypes.h" +#include "math/m_xform.h" + #include "tnl/t_context.h" #include "via_context.h" |