diff options
author | Eric Anholt <[email protected]> | 2012-11-27 14:10:52 -0800 |
---|---|---|
committer | Eric Anholt <[email protected]> | 2012-12-05 14:29:44 -0800 |
commit | 71f06344a0d72a6bd27750ceca571fc016b8de85 (patch) | |
tree | 4a32ebc3e5bff0ad16665a5a0737b2da1c0e0683 | |
parent | ef2fbf67d4bd941a9a0e1c6f8515fb4911e05c50 (diff) |
i965: Add a debug flag for counting cycles spent in each compiled shader.
This can be used for two purposes: Using hand-coded shaders to determine
per-instruction timings, or figuring out which shader to optimize in a
whole application.
Note that this doesn't cover the instructions that set up the message to
the URB/FB write -- we'd need to convert the MRF usage in these
instructions to GRFs so that our offsets/times don't overwrite our
shader outputs.
Reviewed-by: Kenneth Graunke <[email protected]> (v1)
v2: Check the timestamp reset flag in the VS, which is apparently
getting set fairly regularly in the range we watch, resulting in
negative numbers getting added to our 32-bit counter, and thus large
values added to our uint64_t.
v3: Rebase on reladdr changes, removing a new safety check that proved
impossible to satisfy. Add a comment to the AOP defs from Ken's
review, and put them in a slightly more sensible spot.
v4: Check timestamp reset in the FS as well.
-rw-r--r-- | src/mesa/drivers/dri/i965/brw_context.c | 3 | ||||
-rw-r--r-- | src/mesa/drivers/dri/i965/brw_context.h | 28 | ||||
-rw-r--r-- | src/mesa/drivers/dri/i965/brw_defines.h | 23 | ||||
-rw-r--r-- | src/mesa/drivers/dri/i965/brw_eu.h | 6 | ||||
-rw-r--r-- | src/mesa/drivers/dri/i965/brw_eu_emit.c | 56 | ||||
-rw-r--r-- | src/mesa/drivers/dri/i965/brw_fs.cpp | 120 | ||||
-rw-r--r-- | src/mesa/drivers/dri/i965/brw_fs.h | 9 | ||||
-rw-r--r-- | src/mesa/drivers/dri/i965/brw_fs_emit.cpp | 4 | ||||
-rw-r--r-- | src/mesa/drivers/dri/i965/brw_program.c | 127 | ||||
-rw-r--r-- | src/mesa/drivers/dri/i965/brw_vec4.cpp | 106 | ||||
-rw-r--r-- | src/mesa/drivers/dri/i965/brw_vec4.h | 9 | ||||
-rw-r--r-- | src/mesa/drivers/dri/i965/brw_vec4_emit.cpp | 4 | ||||
-rw-r--r-- | src/mesa/drivers/dri/i965/brw_vs_surface_state.c | 10 | ||||
-rw-r--r-- | src/mesa/drivers/dri/i965/brw_vtbl.c | 14 | ||||
-rw-r--r-- | src/mesa/drivers/dri/i965/brw_wm_surface_state.c | 7 | ||||
-rw-r--r-- | src/mesa/drivers/dri/intel/intel_context.c | 6 | ||||
-rw-r--r-- | src/mesa/drivers/dri/intel/intel_context.h | 1 |
17 files changed, 524 insertions, 9 deletions
diff --git a/src/mesa/drivers/dri/i965/brw_context.c b/src/mesa/drivers/dri/i965/brw_context.c index 4b1b247ce7d..5665a3a8517 100644 --- a/src/mesa/drivers/dri/i965/brw_context.c +++ b/src/mesa/drivers/dri/i965/brw_context.c @@ -383,6 +383,9 @@ brwCreateContext(int api, brw_fs_alloc_reg_sets(brw); + if (INTEL_DEBUG & DEBUG_SHADER_TIME) + brw_init_shader_time(brw); + return true; } diff --git a/src/mesa/drivers/dri/i965/brw_context.h b/src/mesa/drivers/dri/i965/brw_context.h index 1abaee3db1d..dc25cabcf53 100644 --- a/src/mesa/drivers/dri/i965/brw_context.h +++ b/src/mesa/drivers/dri/i965/brw_context.h @@ -559,14 +559,15 @@ struct brw_vs_prog_data { #define SURF_INDEX_FRAG_CONST_BUFFER (BRW_MAX_DRAW_BUFFERS + 1) #define SURF_INDEX_TEXTURE(t) (BRW_MAX_DRAW_BUFFERS + 2 + (t)) #define SURF_INDEX_WM_UBO(u) (SURF_INDEX_TEXTURE(BRW_MAX_TEX_UNIT) + u) - +#define SURF_INDEX_WM_SHADER_TIME (SURF_INDEX_WM_UBO(12)) /** Maximum size of the binding table. */ -#define BRW_MAX_WM_SURFACES SURF_INDEX_WM_UBO(BRW_MAX_WM_UBOS) +#define BRW_MAX_WM_SURFACES (SURF_INDEX_WM_SHADER_TIME + 1) #define SURF_INDEX_VERT_CONST_BUFFER (0) #define SURF_INDEX_VS_TEXTURE(t) (SURF_INDEX_VERT_CONST_BUFFER + 1 + (t)) #define SURF_INDEX_VS_UBO(u) (SURF_INDEX_VS_TEXTURE(BRW_MAX_TEX_UNIT) + u) -#define BRW_MAX_VS_SURFACES SURF_INDEX_VS_UBO(BRW_MAX_VS_UBOS) +#define SURF_INDEX_VS_SHADER_TIME (SURF_INDEX_VS_UBO(12)) +#define BRW_MAX_VS_SURFACES (SURF_INDEX_VS_SHADER_TIME + 1) #define SURF_INDEX_SOL_BINDING(t) ((t)) #define BRW_MAX_GS_SURFACES SURF_INDEX_SOL_BINDING(BRW_MAX_SOL_BINDINGS) @@ -651,6 +652,13 @@ struct brw_tracked_state { void (*emit)( struct brw_context *brw ); }; +enum shader_time_shader_type { + ST_NONE, + ST_VS, + ST_FS8, + ST_FS16, +}; + /* Flags for brw->state.cache. 
*/ #define CACHE_NEW_BLEND_STATE (1<<BRW_BLEND_STATE) @@ -1089,6 +1097,16 @@ struct brw_context uint32_t num_instances; int basevertex; + + struct { + drm_intel_bo *bo; + struct gl_shader_program **programs; + enum shader_time_shader_type *types; + uint64_t *cumulative; + int num_entries; + int max_entries; + double report_time; + } shader_time; }; /*====================================================================== @@ -1144,7 +1162,9 @@ void brwInitFragProgFuncs( struct dd_function_table *functions ); int brw_get_scratch_size(int size); void brw_get_scratch_bo(struct intel_context *intel, drm_intel_bo **scratch_bo, int size); - +void brw_init_shader_time(struct brw_context *brw); +void brw_collect_and_report_shader_time(struct brw_context *brw); +void brw_destroy_shader_time(struct brw_context *brw); /* brw_urb.c */ diff --git a/src/mesa/drivers/dri/i965/brw_defines.h b/src/mesa/drivers/dri/i965/brw_defines.h index 3423178b371..5e00b40291d 100644 --- a/src/mesa/drivers/dri/i965/brw_defines.h +++ b/src/mesa/drivers/dri/i965/brw_defines.h @@ -665,6 +665,8 @@ enum opcode { SHADER_OPCODE_TXS, FS_OPCODE_TXB, + SHADER_OPCODE_SHADER_TIME_ADD, + FS_OPCODE_DDX, FS_OPCODE_DDY, FS_OPCODE_PIXEL_X, @@ -731,6 +733,8 @@ enum opcode { #define BRW_ARF_CONTROL 0x80 #define BRW_ARF_NOTIFICATION_COUNT 0x90 #define BRW_ARF_IP 0xA0 +#define BRW_ARF_TDR 0xB0 +#define BRW_ARF_TIMESTAMP 0xC0 #define BRW_MRF_COMPR4 (1 << 7) @@ -913,6 +917,23 @@ enum brw_message_target { #define GEN7_DATAPORT_WRITE_MESSAGE_OWORD_DUAL_BLOCK_WRITE 10 #define GEN7_DATAPORT_DC_DWORD_SCATTERED_READ 3 +/* dataport atomic operations. 
*/ +#define BRW_AOP_AND 1 +#define BRW_AOP_OR 2 +#define BRW_AOP_XOR 3 +#define BRW_AOP_MOV 4 +#define BRW_AOP_INC 5 +#define BRW_AOP_DEC 6 +#define BRW_AOP_ADD 7 +#define BRW_AOP_SUB 8 +#define BRW_AOP_REVSUB 9 +#define BRW_AOP_IMAX 10 +#define BRW_AOP_IMIN 11 +#define BRW_AOP_UMAX 12 +#define BRW_AOP_UMIN 13 +#define BRW_AOP_CMPWR 14 +#define BRW_AOP_PREDEC 15 + #define BRW_MATH_FUNCTION_INV 1 #define BRW_MATH_FUNCTION_LOG 2 #define BRW_MATH_FUNCTION_EXP 3 @@ -960,8 +981,6 @@ enum brw_message_target { #define BRW_SCRATCH_SPACE_SIZE_2M 11 - - #define CMD_URB_FENCE 0x6000 #define CMD_CS_URB_STATE 0x6001 #define CMD_CONST_BUFFER 0x6002 diff --git a/src/mesa/drivers/dri/i965/brw_eu.h b/src/mesa/drivers/dri/i965/brw_eu.h index 8c3a634e2cc..c806e0bb5df 100644 --- a/src/mesa/drivers/dri/i965/brw_eu.h +++ b/src/mesa/drivers/dri/i965/brw_eu.h @@ -200,7 +200,7 @@ static INLINE struct brw_reg brw_reg( GLuint file, else if (file == BRW_MESSAGE_REGISTER_FILE) assert((nr & ~(1 << 7)) < BRW_MAX_MRF); else if (file == BRW_ARCHITECTURE_REGISTER_FILE) - assert(nr <= BRW_ARF_IP); + assert(nr <= BRW_ARF_TIMESTAMP); reg.type = type; reg.file = file; @@ -1006,6 +1006,10 @@ void brw_oword_block_write_scratch(struct brw_compile *p, int num_regs, GLuint offset); +void brw_shader_time_add(struct brw_compile *p, + int mrf, + uint32_t surf_index); + /* If/else/endif. Works by manipulating the execution flags on each * channel. 
*/ diff --git a/src/mesa/drivers/dri/i965/brw_eu_emit.c b/src/mesa/drivers/dri/i965/brw_eu_emit.c index 8a629ff0b40..fb1255f728c 100644 --- a/src/mesa/drivers/dri/i965/brw_eu_emit.c +++ b/src/mesa/drivers/dri/i965/brw_eu_emit.c @@ -253,7 +253,6 @@ brw_set_src0(struct brw_compile *p, struct brw_instruction *insn, assert(!reg.negate); assert(!reg.abs); assert(reg.address_mode == BRW_ADDRESS_DIRECT); - assert(reg.vstride != BRW_VERTICAL_STRIDE_0); } validate_reg(insn, reg); @@ -332,7 +331,8 @@ void brw_set_src1(struct brw_compile *p, { assert(reg.file != BRW_MESSAGE_REGISTER_FILE); - assert(reg.nr < 128); + if (reg.type != BRW_ARCHITECTURE_REGISTER_FILE) + assert(reg.nr < 128); gen7_convert_mrf_to_grf(p, &reg); @@ -2448,3 +2448,55 @@ brw_svb_write(struct brw_compile *p, 0, /* end_of_thread */ send_commit_msg); /* send_commit_msg */ } + +/** + * This instruction is generated as a single-channel align1 instruction by + * both the VS and FS stages when using INTEL_DEBUG=shader_time. + * + * We can't use the typed atomic op in the FS because that has the execution + * mask ANDed with the pixel mask, but we just want to write the one dword for + * all the pixels. + * + * We don't use the SIMD4x2 atomic ops in the VS because want to just write + * one u32. So we use the same untyped atomic write message as the pixel + * shader. + * + * The untyped atomic operation requires a BUFFER surface type with RAW + * format, and is only accessible through the legacy DATA_CACHE dataport + * messages. + */ +void brw_shader_time_add(struct brw_compile *p, + int base_mrf, + uint32_t surf_index) +{ + struct intel_context *intel = &p->brw->intel; + assert(intel->gen >= 7); + + brw_push_insn_state(p); + brw_set_access_mode(p, BRW_ALIGN_1); + brw_set_mask_control(p, BRW_MASK_DISABLE); + struct brw_instruction *send = brw_next_insn(p, BRW_OPCODE_SEND); + brw_pop_insn_state(p); + + /* We use brw_vec1_reg and unmasked because we want to increment the given + * offset only once. 
+ */ + brw_set_dest(p, send, brw_vec1_reg(BRW_ARCHITECTURE_REGISTER_FILE, + BRW_ARF_NULL, 0)); + brw_set_src0(p, send, brw_vec1_reg(BRW_MESSAGE_REGISTER_FILE, + base_mrf, 0)); + + bool header_present = false; + bool eot = false; + uint32_t mlen = 2; /* offset, value */ + uint32_t rlen = 0; + brw_set_message_descriptor(p, send, + GEN7_SFID_DATAPORT_DATA_CACHE, + mlen, rlen, header_present, eot); + + send->bits3.ud |= 6 << 14; /* untyped atomic op */ + send->bits3.ud |= 0 << 13; /* no return data */ + send->bits3.ud |= 1 << 12; /* SIMD8 mode */ + send->bits3.ud |= BRW_AOP_ADD << 8; + send->bits3.ud |= surf_index << 0; +} diff --git a/src/mesa/drivers/dri/i965/brw_fs.cpp b/src/mesa/drivers/dri/i965/brw_fs.cpp index 9ed91632cbe..d77a67e0240 100644 --- a/src/mesa/drivers/dri/i965/brw_fs.cpp +++ b/src/mesa/drivers/dri/i965/brw_fs.cpp @@ -459,6 +459,118 @@ fs_visitor::type_size(const struct glsl_type *type) } } +fs_reg +fs_visitor::get_timestamp() +{ + assert(intel->gen >= 7); + + fs_reg ts = fs_reg(retype(brw_vec1_reg(BRW_ARCHITECTURE_REGISTER_FILE, + BRW_ARF_TIMESTAMP, + 0), + BRW_REGISTER_TYPE_UD)); + + fs_reg dst = fs_reg(this, glsl_type::uint_type); + + fs_inst *mov = emit(MOV(dst, ts)); + /* We want to read the 3 fields we care about (mostly field 0, but also 2) + * even if it's not enabled in the dispatch. + */ + mov->force_writemask_all = true; + mov->force_uncompressed = true; + + /* The caller wants the low 32 bits of the timestamp. Since it's running + * at the GPU clock rate of ~1.2ghz, it will roll over every ~3 seconds, + * which is plenty of time for our purposes. It is identical across the + * EUs, but since it's tracking GPU core speed it will increment at a + * varying rate as render P-states change. + * + * The caller could also check if render P-states have changed (or anything + * else that might disrupt timing) by setting smear to 2 and checking if + * that field is != 0. 
+ */ + dst.smear = 0; + + return dst; +} + +void +fs_visitor::emit_shader_time_begin() +{ + current_annotation = "shader time start"; + shader_start_time = get_timestamp(); +} + +void +fs_visitor::emit_shader_time_end() +{ + current_annotation = "shader time end"; + + enum shader_time_shader_type type; + if (dispatch_width == 8) { + type = ST_FS8; + } else { + assert(dispatch_width == 16); + type = ST_FS16; + } + + emit_shader_time_write(type, shader_start_time, get_timestamp()); +} + +void +fs_visitor::emit_shader_time_write(enum shader_time_shader_type type, + fs_reg start, fs_reg end) +{ + /* Choose an index in the buffer and set up tracking information for our + * printouts. + */ + int shader_time_index = brw->shader_time.num_entries++; + assert(shader_time_index <= brw->shader_time.max_entries); + brw->shader_time.types[shader_time_index] = type; + if (prog) { + _mesa_reference_shader_program(ctx, + &brw->shader_time.programs[shader_time_index], + prog); + } + + /* Check that there weren't any timestamp reset events (assuming these + * were the only two timestamp reads that happened). + */ + fs_reg reset = end; + reset.smear = 2; + fs_inst *test = emit(AND(reg_null_d, reset, fs_reg(1u))); + test->conditional_mod = BRW_CONDITIONAL_Z; + emit(IF(BRW_PREDICATE_NORMAL)); + + push_force_uncompressed(); + start.negate = true; + fs_reg diff = fs_reg(this, glsl_type::uint_type); + emit(ADD(diff, start, end)); + + /* If there were no instructions between the two timestamp gets, the diff + * is 2 cycles. Remove that overhead, so I can forget about that when + * trying to determine the time taken for single instructions. 
+ */ + emit(ADD(diff, diff, fs_reg(-2u))); + + int base_mrf = 6; + + fs_reg offset_mrf = fs_reg(MRF, base_mrf); + offset_mrf.type = BRW_REGISTER_TYPE_UD; + emit(MOV(offset_mrf, fs_reg(shader_time_index * 4))); + + fs_reg time_mrf = fs_reg(MRF, base_mrf + 1); + time_mrf.type = BRW_REGISTER_TYPE_UD; + emit(MOV(time_mrf, diff)); + + fs_inst *inst = emit(fs_inst(SHADER_OPCODE_SHADER_TIME_ADD)); + inst->base_mrf = base_mrf; + inst->mlen = 2; + + pop_force_uncompressed(); + + emit(BRW_OPCODE_ENDIF); +} + void fs_visitor::fail(const char *format, ...) { @@ -571,6 +683,8 @@ fs_visitor::implied_mrf_writes(fs_inst *inst) case SHADER_OPCODE_TXL: case SHADER_OPCODE_TXS: return 1; + case SHADER_OPCODE_SHADER_TIME_ADD: + return 0; case FS_OPCODE_FB_WRITE: return 2; case FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD: @@ -2295,6 +2409,9 @@ fs_visitor::run() if (0) { emit_dummy_fs(); } else { + if (INTEL_DEBUG & DEBUG_SHADER_TIME) + emit_shader_time_begin(); + calculate_urb_setup(); if (intel->gen < 6) emit_interpolation_setup_gen4(); @@ -2318,6 +2435,9 @@ fs_visitor::run() if (failed) return false; + if (INTEL_DEBUG & DEBUG_SHADER_TIME) + emit_shader_time_end(); + emit_fb_writes(); split_virtual_grfs(); diff --git a/src/mesa/drivers/dri/i965/brw_fs.h b/src/mesa/drivers/dri/i965/brw_fs.h index 903d7eda088..51efc113fcc 100644 --- a/src/mesa/drivers/dri/i965/brw_fs.h +++ b/src/mesa/drivers/dri/i965/brw_fs.h @@ -363,6 +363,12 @@ public: void emit_color_write(int target, int index, int first_color_mrf); void emit_fb_writes(); + + void emit_shader_time_begin(); + void emit_shader_time_end(); + void emit_shader_time_write(enum shader_time_shader_type type, + fs_reg start, fs_reg end); + bool try_rewrite_rhs_to_dst(ir_assignment *ir, fs_reg dst, fs_reg src, @@ -373,6 +379,8 @@ public: void resolve_ud_negate(fs_reg *reg); void resolve_bool_comparison(ir_rvalue *rvalue, fs_reg *reg); + fs_reg get_timestamp(); + struct brw_reg interp_reg(int location, int channel); int setup_uniform_values(int loc, 
const glsl_type *type); void setup_builtin_uniform_values(ir_variable *ir); @@ -435,6 +443,7 @@ public: fs_reg pixel_w; fs_reg delta_x[BRW_WM_BARYCENTRIC_INTERP_MODE_COUNT]; fs_reg delta_y[BRW_WM_BARYCENTRIC_INTERP_MODE_COUNT]; + fs_reg shader_start_time; int grf_used; diff --git a/src/mesa/drivers/dri/i965/brw_fs_emit.cpp b/src/mesa/drivers/dri/i965/brw_fs_emit.cpp index 87a7e9b81c5..4e8b44e5684 100644 --- a/src/mesa/drivers/dri/i965/brw_fs_emit.cpp +++ b/src/mesa/drivers/dri/i965/brw_fs_emit.cpp @@ -1124,6 +1124,10 @@ fs_generator::generate_code(exec_list *instructions) generate_mov_dispatch_to_flags(); break; + case SHADER_OPCODE_SHADER_TIME_ADD: + brw_shader_time_add(p, inst->base_mrf, SURF_INDEX_WM_SHADER_TIME); + break; + default: if (inst->opcode < (int) ARRAY_SIZE(opcode_descs)) { _mesa_problem(ctx, "Unsupported opcode `%s' in FS", diff --git a/src/mesa/drivers/dri/i965/brw_program.c b/src/mesa/drivers/dri/i965/brw_program.c index 6bf5a6a0002..1859041bc27 100644 --- a/src/mesa/drivers/dri/i965/brw_program.c +++ b/src/mesa/drivers/dri/i965/brw_program.c @@ -189,3 +189,130 @@ void brwInitFragProgFuncs( struct dd_function_table *functions ) functions->LinkShader = brw_link_shader; } +void +brw_init_shader_time(struct brw_context *brw) +{ + struct intel_context *intel = &brw->intel; + + const int max_entries = 4096; + brw->shader_time.bo = drm_intel_bo_alloc(intel->bufmgr, "shader time", + max_entries * 4, 4096); + brw->shader_time.programs = rzalloc_array(brw, struct gl_shader_program *, + max_entries); + brw->shader_time.types = rzalloc_array(brw, enum shader_time_shader_type, + max_entries); + brw->shader_time.cumulative = rzalloc_array(brw, uint64_t, + max_entries); + brw->shader_time.max_entries = max_entries; +} + +static int +compare_time(const void *a, const void *b) +{ + uint64_t * const *a_val = a; + uint64_t * const *b_val = b; + + /* We don't just subtract because we're turning the value to an int. 
*/ + if (**a_val < **b_val) + return -1; + else if (**a_val == **b_val) + return 0; + else + return 1; +} + +static void +brw_report_shader_time(struct brw_context *brw) +{ + if (!brw->shader_time.bo || !brw->shader_time.num_entries) + return; + + uint64_t *sorted[brw->shader_time.num_entries]; + double total = 0; + for (int i = 0; i < brw->shader_time.num_entries; i++) { + sorted[i] = &brw->shader_time.cumulative[i]; + total += brw->shader_time.cumulative[i]; + } + + if (total == 0) { + printf("No shader time collected yet\n"); + return; + } + + qsort(sorted, brw->shader_time.num_entries, sizeof(sorted[0]), compare_time); + + printf("\n"); + printf("type ID cycles spent %% of total\n"); + for (int s = 0; s < brw->shader_time.num_entries; s++) { + /* Work back from the sorted pointers times to a time to print. */ + int i = sorted[s] - brw->shader_time.cumulative; + + int shader_num = -1; + if (brw->shader_time.programs[i]) { + shader_num = brw->shader_time.programs[i]->Name; + } + + switch (brw->shader_time.types[i]) { + case ST_VS: + printf("vs %4d: ", shader_num); + break; + case ST_FS8: + printf("fs8 %4d: ", shader_num); + break; + case ST_FS16: + printf("fs16 %4d: ", shader_num); + break; + default: + printf("other: "); + break; + } + + printf("%16lld (%7.2f Gcycles) %4.1f%%\n", + (long long)brw->shader_time.cumulative[i], + (double)brw->shader_time.cumulative[i] / 1000000000.0, + (double)brw->shader_time.cumulative[i] / total * 100.0); + } +} + +static void +brw_collect_shader_time(struct brw_context *brw) +{ + if (!brw->shader_time.bo) + return; + + /* This probably stalls on the last rendering. We could fix that by + * delaying reading the reports, but it doesn't look like it's a big + * overhead compared to the cost of tracking the time in the first place. 
+ */ + drm_intel_bo_map(brw->shader_time.bo, true); + + uint32_t *times = brw->shader_time.bo->virtual; + + for (int i = 0; i < brw->shader_time.num_entries; i++) { + brw->shader_time.cumulative[i] += times[i]; + } + + /* Zero the BO out to clear it out for our next collection. + */ + memset(times, 0, brw->shader_time.bo->size); + drm_intel_bo_unmap(brw->shader_time.bo); +} + +void +brw_collect_and_report_shader_time(struct brw_context *brw) +{ + brw_collect_shader_time(brw); + + if (brw->shader_time.report_time == 0 || + get_time() - brw->shader_time.report_time >= 1.0) { + brw_report_shader_time(brw); + brw->shader_time.report_time = get_time(); + } +} + +void +brw_destroy_shader_time(struct brw_context *brw) +{ + drm_intel_bo_unreference(brw->shader_time.bo); + brw->shader_time.bo = NULL; +} diff --git a/src/mesa/drivers/dri/i965/brw_vec4.cpp b/src/mesa/drivers/dri/i965/brw_vec4.cpp index 5200daac043..dc9d9d5d1a5 100644 --- a/src/mesa/drivers/dri/i965/brw_vec4.cpp +++ b/src/mesa/drivers/dri/i965/brw_vec4.cpp @@ -26,6 +26,7 @@ extern "C" { #include "main/macros.h" +#include "main/shaderobj.h" #include "program/prog_print.h" #include "program/prog_parameter.h" } @@ -248,6 +249,8 @@ vec4_visitor::implied_mrf_writes(vec4_instruction *inst) return 2; case VS_OPCODE_SCRATCH_WRITE: return 3; + case SHADER_OPCODE_SHADER_TIME_ADD: + return 0; default: assert(!"not reached"); return inst->mlen; @@ -1039,9 +1042,109 @@ vec4_visitor::setup_payload(void) this->first_non_payload_grf = reg; } +src_reg +vec4_visitor::get_timestamp() +{ + assert(intel->gen >= 7); + + src_reg ts = src_reg(brw_reg(BRW_ARCHITECTURE_REGISTER_FILE, + BRW_ARF_TIMESTAMP, + 0, + BRW_REGISTER_TYPE_UD, + BRW_VERTICAL_STRIDE_0, + BRW_WIDTH_4, + BRW_HORIZONTAL_STRIDE_4, + BRW_SWIZZLE_XYZW, + WRITEMASK_XYZW)); + + dst_reg dst = dst_reg(this, glsl_type::uvec4_type); + + vec4_instruction *mov = emit(MOV(dst, ts)); + /* We want to read the 3 fields we care about (mostly field 0, but also 2) + * even if it's not 
enabled in the dispatch. + */ + mov->force_writemask_all = true; + + return src_reg(dst); +} + +void +vec4_visitor::emit_shader_time_begin() +{ + current_annotation = "shader time start"; + shader_start_time = get_timestamp(); +} + +void +vec4_visitor::emit_shader_time_end() +{ + current_annotation = "shader time end"; + src_reg shader_end_time = get_timestamp(); + + emit_shader_time_write(ST_VS, shader_start_time, shader_end_time); +} + +void +vec4_visitor::emit_shader_time_write(enum shader_time_shader_type type, + src_reg start, src_reg end) +{ + /* Choose an index in the buffer and set up tracking information for our + * printouts. + */ + int shader_time_index = brw->shader_time.num_entries++; + assert(shader_time_index <= brw->shader_time.max_entries); + brw->shader_time.types[shader_time_index] = type; + if (prog) { + _mesa_reference_shader_program(ctx, + &brw->shader_time.programs[shader_time_index], + prog); + } + + /* Check that there weren't any timestamp reset events (assuming these + * were the only two timestamp reads that happened). + */ + src_reg reset_end = end; + reset_end.swizzle = BRW_SWIZZLE_ZZZZ; + vec4_instruction *test = emit(AND(dst_null_d(), reset_end, src_reg(1u))); + test->conditional_mod = BRW_CONDITIONAL_Z; + + emit(IF(BRW_PREDICATE_NORMAL)); + + /* Take the current timestamp and get the delta. */ + start.negate = true; + dst_reg diff = dst_reg(this, glsl_type::uint_type); + emit(ADD(diff, start, end)); + + /* If there were no instructions between the two timestamp gets, the diff + * is 2 cycles. Remove that overhead, so I can forget about that when + * trying to determine the time taken for single instructions. 
+ */ + emit(ADD(diff, src_reg(diff), src_reg(-2u))); + + int base_mrf = 6; + + dst_reg offset_mrf = dst_reg(MRF, base_mrf); + offset_mrf.type = BRW_REGISTER_TYPE_UD; + emit(MOV(offset_mrf, src_reg(shader_time_index * 4))); + + dst_reg time_mrf = dst_reg(MRF, base_mrf + 1); + time_mrf.type = BRW_REGISTER_TYPE_UD; + emit(MOV(time_mrf, src_reg(diff))); + + vec4_instruction *inst; + inst = emit(SHADER_OPCODE_SHADER_TIME_ADD); + inst->base_mrf = base_mrf; + inst->mlen = 2; + + emit(BRW_OPCODE_ENDIF); +} + bool vec4_visitor::run() { + if (INTEL_DEBUG & DEBUG_SHADER_TIME) + emit_shader_time_begin(); + emit_attribute_fixups(); /* Generate VS IR for main(). (the visitor only descends into @@ -1057,6 +1160,9 @@ vec4_visitor::run() if (c->key.userclip_active && !c->key.uses_clip_distance) setup_uniform_clipplane_values(); + if (INTEL_DEBUG & DEBUG_SHADER_TIME) + emit_shader_time_end(); + emit_urb_writes(); /* Before any optimization, push array accesses out to scratch diff --git a/src/mesa/drivers/dri/i965/brw_vec4.h b/src/mesa/drivers/dri/i965/brw_vec4.h index 8c6f56a2fdc..91d9545559c 100644 --- a/src/mesa/drivers/dri/i965/brw_vec4.h +++ b/src/mesa/drivers/dri/i965/brw_vec4.h @@ -302,6 +302,8 @@ public: int uniform_vector_size[MAX_UNIFORMS]; int uniforms; + src_reg shader_start_time; + struct hash_table *variable_ht; bool run(void); @@ -434,6 +436,11 @@ public: void emit_urb_slot(int mrf, int vert_result); void emit_urb_writes(void); + void emit_shader_time_begin(); + void emit_shader_time_end(); + void emit_shader_time_write(enum shader_time_shader_type type, + src_reg start, src_reg end); + src_reg get_scratch_offset(vec4_instruction *inst, src_reg *reladdr, int reg_offset); src_reg get_pull_constant_offset(vec4_instruction *inst, @@ -452,6 +459,8 @@ public: bool try_emit_sat(ir_expression *ir); void resolve_ud_negate(src_reg *reg); + src_reg get_timestamp(); + bool process_move_condition(ir_rvalue *ir); void dump_instruction(vec4_instruction *inst); diff --git 
a/src/mesa/drivers/dri/i965/brw_vec4_emit.cpp b/src/mesa/drivers/dri/i965/brw_vec4_emit.cpp index 0c6b358af6c..9fa742d740c 100644 --- a/src/mesa/drivers/dri/i965/brw_vec4_emit.cpp +++ b/src/mesa/drivers/dri/i965/brw_vec4_emit.cpp @@ -660,6 +660,10 @@ vec4_generator::generate_vs_instruction(vec4_instruction *instruction, generate_pull_constant_load(inst, dst, src[0], src[1]); break; + case SHADER_OPCODE_SHADER_TIME_ADD: + brw_shader_time_add(p, inst->base_mrf, SURF_INDEX_VS_SHADER_TIME); + break; + default: if (inst->opcode < (int) ARRAY_SIZE(opcode_descs)) { _mesa_problem(ctx, "Unsupported opcode in `%s' in VS\n", diff --git a/src/mesa/drivers/dri/i965/brw_vs_surface_state.c b/src/mesa/drivers/dri/i965/brw_vs_surface_state.c index d70c36ef71d..3985b4811fa 100644 --- a/src/mesa/drivers/dri/i965/brw_vs_surface_state.c +++ b/src/mesa/drivers/dri/i965/brw_vs_surface_state.c @@ -138,9 +138,19 @@ const struct brw_tracked_state brw_vs_ubo_surfaces = { static void brw_vs_upload_binding_table(struct brw_context *brw) { + struct intel_context *intel = &brw->intel; uint32_t *bind; int i; + if (INTEL_DEBUG & DEBUG_SHADER_TIME) { + intel->vtbl.create_constant_surface(brw, brw->shader_time.bo, 0, + brw->shader_time.bo->size, + &brw->vs.surf_offset[SURF_INDEX_VS_SHADER_TIME]); + + assert(brw->vs.prog_data->num_surfaces <= SURF_INDEX_VS_SHADER_TIME); + brw->vs.prog_data->num_surfaces = SURF_INDEX_VS_SHADER_TIME; + } + /* CACHE_NEW_VS_PROG: Skip making a binding table if we don't use textures or * pull constants. 
*/ diff --git a/src/mesa/drivers/dri/i965/brw_vtbl.c b/src/mesa/drivers/dri/i965/brw_vtbl.c index 0da6070ec1c..f2f0e7b9590 100644 --- a/src/mesa/drivers/dri/i965/brw_vtbl.c +++ b/src/mesa/drivers/dri/i965/brw_vtbl.c @@ -43,6 +43,7 @@ #include "intel_fbo.h" #include "brw_context.h" +#include "brw_program.h" #include "brw_defines.h" #include "brw_state.h" #include "brw_draw.h" @@ -69,6 +70,11 @@ static void brw_destroy_context( struct intel_context *intel ) { struct brw_context *brw = brw_context(&intel->ctx); + if (INTEL_DEBUG & DEBUG_SHADER_TIME) { + brw_collect_and_report_shader_time(brw); + brw_destroy_shader_time(brw); + } + brw_destroy_state(brw); brw_draw_destroy( brw ); @@ -201,6 +207,14 @@ static void brw_new_batch( struct intel_context *intel ) * next batch. */ brw->cache.bo_used_by_gpu = true; + + /* We need to periodically reap the shader time results, because rollover + * happens every few seconds. We also want to see results every once in a + * while, because many programs won't cleanly destroy our context, so the + * end-of-run printout may not happen. 
+ */ + if (INTEL_DEBUG & DEBUG_SHADER_TIME) + brw_collect_and_report_shader_time(brw); } static void brw_invalidate_state( struct intel_context *intel, GLuint new_state ) diff --git a/src/mesa/drivers/dri/i965/brw_wm_surface_state.c b/src/mesa/drivers/dri/i965/brw_wm_surface_state.c index a7866d583a5..66301a9b08c 100644 --- a/src/mesa/drivers/dri/i965/brw_wm_surface_state.c +++ b/src/mesa/drivers/dri/i965/brw_wm_surface_state.c @@ -1405,9 +1405,16 @@ const struct brw_tracked_state brw_wm_ubo_surfaces = { static void brw_upload_wm_binding_table(struct brw_context *brw) { + struct intel_context *intel = &brw->intel; uint32_t *bind; int i; + if (INTEL_DEBUG & DEBUG_SHADER_TIME) { + intel->vtbl.create_constant_surface(brw, brw->shader_time.bo, 0, + brw->shader_time.bo->size, + &brw->wm.surf_offset[SURF_INDEX_WM_SHADER_TIME]); + } + /* Might want to calculate nr_surfaces first, to avoid taking up so much * space for the binding table. */ diff --git a/src/mesa/drivers/dri/intel/intel_context.c b/src/mesa/drivers/dri/intel/intel_context.c index 15b77b21438..b95972f8e9d 100644 --- a/src/mesa/drivers/dri/intel/intel_context.c +++ b/src/mesa/drivers/dri/intel/intel_context.c @@ -492,6 +492,7 @@ static const struct dri_debug_control debug_control[] = { { "vs", DEBUG_VS }, { "clip", DEBUG_CLIP }, { "aub", DEBUG_AUB }, + { "shader_time", DEBUG_SHADER_TIME }, { NULL, 0 } }; @@ -747,6 +748,11 @@ intelInitContext(struct intel_context *intel, INTEL_DEBUG = driParseDebugString(getenv("INTEL_DEBUG"), debug_control); if (INTEL_DEBUG & DEBUG_BUFMGR) dri_bufmgr_set_debug(intel->bufmgr, true); + if ((INTEL_DEBUG & DEBUG_SHADER_TIME) && intel->gen < 7) { + fprintf(stderr, + "shader_time debugging requires gen7 (Ivybridge) or better.\n"); + INTEL_DEBUG &= ~DEBUG_SHADER_TIME; + } if (INTEL_DEBUG & DEBUG_AUB) drm_intel_bufmgr_gem_set_aub_dump(intel->bufmgr, true); diff --git a/src/mesa/drivers/dri/intel/intel_context.h b/src/mesa/drivers/dri/intel/intel_context.h index 
eeefadffb29..8d544dccaef 100644 --- a/src/mesa/drivers/dri/intel/intel_context.h +++ b/src/mesa/drivers/dri/intel/intel_context.h @@ -456,6 +456,7 @@ extern int INTEL_DEBUG; #define DEBUG_VS 0x1000000 #define DEBUG_CLIP 0x2000000 #define DEBUG_AUB 0x4000000 +#define DEBUG_SHADER_TIME 0x8000000 #ifdef HAVE_ANDROID_PLATFORM #define LOG_TAG "INTEL-MESA" |