/**************************************************************************
 *
 * Copyright 2006 VMware, Inc.
 * All Rights Reserved.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the
 * "Software"), to deal in the Software without restriction, including
 * without limitation the rights to use, copy, modify, merge, publish,
 * distribute, sub license, and/or sell copies of the Software, and to
 * permit persons to whom the Software is furnished to do so, subject to
 * the following conditions:
 *
 * The above copyright notice and this permission notice (including the
 * next paragraph) shall be included in all copies or substantial portions
 * of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
 * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
 * IN NO EVENT SHALL VMWARE AND/OR ITS SUPPLIERS BE LIABLE FOR
 * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
 * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
 * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
 *
 **************************************************************************/

#include "intel_batchbuffer.h"
#include "intel_buffer_objects.h"
#include "intel_reg.h"
#include "intel_bufmgr.h"
#include "intel_buffers.h"
#include "intel_fbo.h"
#include "brw_context.h"

/* drmCommandNone() and the DRM_I915_* / I915_EXEC_* definitions used below. */
#include <xf86drm.h>
#include <i915_drm.h>

static void
intel_batchbuffer_reset(struct brw_context *brw);

/**
 * One-time setup of the context's batchbuffer state.
 *
 * Allocates the first batch BO (through intel_batchbuffer_reset), the
 * PIPE_CONTROL workaround BO on gen >= 6, and, when the device has no LLC,
 * a malloc'ed CPU shadow of BATCH_SZ bytes that batch.map points at.
 */
void
intel_batchbuffer_init(struct brw_context *brw)
{
   intel_batchbuffer_reset(brw);

   if (brw->gen >= 6) {
      /* We can't just use brw_state_batch to get a chunk of space for
       * the gen6 workaround because it involves actually writing to
       * the buffer, and the kernel doesn't let us write to the batch.
       */
      brw->batch.workaround_bo = drm_intel_bo_alloc(brw->bufmgr,
                                                    "pipe_control workaround",
                                                    4096, 4096);
   }

   if (!brw->has_llc) {
      /* NOTE(review): the malloc result is not checked; a NULL map would
       * only be noticed at the first write into the batch.
       */
      brw->batch.cpu_map = malloc(BATCH_SZ);
      brw->batch.map = brw->batch.cpu_map;
   }
}

/**
 * Replaces the current batch BO with a freshly allocated one.
 *
 * The reference held on the previous last_bo is dropped, the current BO
 * becomes last_bo, the render cache set is cleared and all per-batch
 * bookkeeping (used count, state offset, reserved space, ring, ...) is
 * returned to its initial value.  With LLC the new BO is mapped and
 * batch.map points straight into it.
 */
static void
intel_batchbuffer_reset(struct brw_context *brw)
{
   if (brw->batch.last_bo != NULL) {
      drm_intel_bo_unreference(brw->batch.last_bo);
      brw->batch.last_bo = NULL;
   }
   brw->batch.last_bo = brw->batch.bo;

   brw_render_cache_set_clear(brw);

   brw->batch.bo = drm_intel_bo_alloc(brw->bufmgr, "batchbuffer",
                                      BATCH_SZ, 4096);
   if (brw->has_llc) {
      drm_intel_bo_map(brw->batch.bo, true);
      brw->batch.map = brw->batch.bo->virtual;
   }

   brw->batch.reserved_space = BATCH_RESERVED;
   brw->batch.state_batch_offset = brw->batch.bo->size;
   brw->batch.used = 0;
   brw->batch.needs_sol_reset = false;
   brw->batch.pipe_controls_since_last_cs_stall = 0;

   /* We don't know what ring the new batch will be sent to until we see the
    * first BEGIN_BATCH or BEGIN_BATCH_BLT.  Mark it as unknown.
    */
   brw->batch.ring = UNKNOWN_RING;
}

/**
 * Records the current fill level and relocation count of the batch so that
 * intel_batchbuffer_reset_to_saved() can roll back to this point.
 */
void
intel_batchbuffer_save_state(struct brw_context *brw)
{
   brw->batch.saved.used = brw->batch.used;
   brw->batch.saved.reloc_count =
      drm_intel_gem_bo_get_reloc_count(brw->batch.bo);
}

/**
 * Rolls the batch back to the point captured by
 * intel_batchbuffer_save_state(): relocations emitted since then are
 * discarded and the used count is restored.  If that leaves the batch
 * empty, the ring goes back to UNKNOWN_RING.
 */
void
intel_batchbuffer_reset_to_saved(struct brw_context *brw)
{
   drm_intel_gem_bo_clear_relocs(brw->batch.bo, brw->batch.saved.reloc_count);

   brw->batch.used = brw->batch.saved.used;
   if (brw->batch.used == 0)
      brw->batch.ring = UNKNOWN_RING;
}

/**
 * Releases everything owned by the batch state: the CPU shadow map and the
 * references on last_bo, bo and workaround_bo.
 */
void
intel_batchbuffer_free(struct brw_context *brw)
{
   free(brw->batch.cpu_map);
   drm_intel_bo_unreference(brw->batch.last_bo);
   drm_intel_bo_unreference(brw->batch.bo);
   drm_intel_bo_unreference(brw->batch.workaround_bo);
}

/**
 * Decodes the current batch to stderr (DEBUG_BATCH).
 *
 * The BO contents are decoded when the BO can be mapped; otherwise a
 * warning is printed and the CPU-side copy in batch->map is decoded
 * instead.  brw_debug_batch() is only run when the mapping succeeded.
 */
static void
do_batch_dump(struct brw_context *brw)
{
   struct drm_intel_decode *decode;
   struct intel_batchbuffer *batch = &brw->batch;
   int ret;

   decode = drm_intel_decode_context_alloc(brw->intelScreen->deviceID);
   if (!decode)
      return;

   ret = drm_intel_bo_map(batch->bo, false);
   if (ret == 0) {
      drm_intel_decode_set_batch_pointer(decode,
                                         batch->bo->virtual,
                                         batch->bo->offset64,
                                         batch->used);
   } else {
      /* libdrm reports failures as negative errno values, so negate before
       * handing the code to strerror() (as do_flush_locked does).
       */
      fprintf(stderr,
              "WARNING: failed to map batchbuffer (%s), "
              "dumping uploaded data instead.\n", strerror(-ret));

      drm_intel_decode_set_batch_pointer(decode,
                                         batch->map,
                                         batch->bo->offset64,
                                         batch->used);
   }

   drm_intel_decode_set_output_file(decode, stderr);
   drm_intel_decode(decode);

   drm_intel_decode_context_free(decode);

   if (ret == 0) {
      drm_intel_bo_unmap(batch->bo);

      brw_debug_batch(brw);
   }
}

/**
 * Emits whatever must come first in a batch destined for the render ring.
 */
void
intel_batchbuffer_emit_render_ring_prelude(struct brw_context *brw)
{
   /* We may need to enable and snapshot OA counters. */
   brw_perf_monitor_new_batch(brw);
}

/**
 * Called when starting a new batch buffer.
 */
static void
brw_new_batch(struct brw_context *brw)
{
   /* Create a new batchbuffer and reset the associated state: */
   drm_intel_gem_bo_clear_relocs(brw->batch.bo, 0);
   intel_batchbuffer_reset(brw);

   /* If the kernel supports hardware contexts, then most hardware state is
    * preserved between batches; we only need to re-emit state that is required
    * to be in every batch.  Otherwise we need to re-emit all the state that
    * would otherwise be stored in the context (which for all intents and
    * purposes means everything).
    */
   if (brw->hw_ctx == NULL)
      brw->ctx.NewDriverState |= BRW_NEW_CONTEXT;

   brw->ctx.NewDriverState |= BRW_NEW_BATCH;

   brw->state_batch_count = 0;

   brw->ib.type = -1;

   /* We need to periodically reap the shader time results, because rollover
    * happens every few seconds.  We also want to see results every once in a
    * while, because many programs won't cleanly destroy our context, so the
    * end-of-run printout may not happen.
    */
   if (INTEL_DEBUG & DEBUG_SHADER_TIME)
      brw_collect_and_report_shader_time(brw);

   if (INTEL_DEBUG & DEBUG_PERFMON)
      brw_dump_perf_monitors(brw);
}

/**
 * Called from intel_batchbuffer_flush before emitting MI_BATCHBUFFER_END and
 * sending it off.
 *
 * This function can emit state (say, to preserve registers that aren't saved
 * between batches).  All of this state MUST fit in the reserved space at the
 * end of the batchbuffer.  If you add more GPU state, increase the reserved
 * space by updating the BATCH_RESERVED macro.
 */
static void
brw_finish_batch(struct brw_context *brw)
{
   /* Capture the closing pipeline statistics register values necessary to
    * support query objects (in the non-hardware context world).
    */
   brw_emit_query_end(brw);

   /* We may also need to snapshot and disable OA counters. */
   if (brw->batch.ring == RENDER_RING)
      brw_perf_monitor_finish_batch(brw);

   /* Mark that the current program cache BO has been used by the GPU.
    * It will be reallocated if we need to put new programs in for the
    * next batch.
    */
   brw->cache.bo_used_by_gpu = true;
}

/**
 * Applies swap and flush throttling after a batch has been submitted.
 *
 * Swap throttling waits on the batch remembered two swaps ago (unless
 * disable_throttling is set) and shifts the throttle_batch slots; flush
 * throttling issues the DRM_I915_GEM_THROTTLE ioctl.  Both request flags
 * are cleared once serviced.
 */
static void
throttle(struct brw_context *brw)
{
   /* Wait for the swapbuffers before the one we just emitted, so we
    * don't get too many swaps outstanding for apps that are GPU-heavy
    * but not CPU-heavy.
    *
    * We're using intelDRI2Flush (called from the loader before
    * swapbuffer) and glFlush (for front buffer rendering) as the
    * indicator that a frame is done and then throttle when we get
    * here as we prepare to render the next frame.  At this point the
    * round trips for swap/copy and getting new buffers are done and
    * we'll spend less time waiting on the GPU.
    *
    * Unfortunately, we don't have a handle to the batch containing
    * the swap, and getting our hands on that doesn't seem worth it,
    * so we just use the first batch we emitted after the last swap.
    */
   if (brw->need_swap_throttle && brw->throttle_batch[0]) {
      if (brw->throttle_batch[1]) {
         if (!brw->disable_throttling)
            drm_intel_bo_wait_rendering(brw->throttle_batch[1]);
         drm_intel_bo_unreference(brw->throttle_batch[1]);
      }
      brw->throttle_batch[1] = brw->throttle_batch[0];
      brw->throttle_batch[0] = NULL;
      brw->need_swap_throttle = false;
      /* Throttling here is more precise than the throttle ioctl, so skip it */
      brw->need_flush_throttle = false;
   }

   if (brw->need_flush_throttle) {
      __DRIscreen *psp = brw->intelScreen->driScrnPriv;
      drmCommandNone(psp->fd, DRM_I915_GEM_THROTTLE);
      brw->need_flush_throttle = false;
   }
}

/**
 * Uploads (or unmaps) the batch contents and submits the batch for
 * execution.
 *
 * Without LLC the command dwords and, if any were written, the state area
 * at the end of the buffer are copied from the CPU shadow into the BO.
 * Unless no_hw is set the batch is then executed on the BLT or render
 * ring and throttle() is run.  Any failure is reported on stderr and
 * terminates the process, so in practice this only returns 0.
 */
/* TODO: Push this whole function into bufmgr.
 */
static int
do_flush_locked(struct brw_context *brw)
{
   struct intel_batchbuffer *batch = &brw->batch;
   int ret = 0;

   if (brw->has_llc) {
      drm_intel_bo_unmap(batch->bo);
   } else {
      /* batch->used counts dwords; subdata takes bytes. */
      ret = drm_intel_bo_subdata(batch->bo, 0, 4*batch->used, batch->map);
      if (ret == 0 && batch->state_batch_offset != batch->bo->size) {
         ret = drm_intel_bo_subdata(batch->bo,
                                    batch->state_batch_offset,
                                    batch->bo->size - batch->state_batch_offset,
                                    (char *)batch->map + batch->state_batch_offset);
      }
   }

   if (!brw->intelScreen->no_hw) {
      int flags;

      if (brw->gen >= 6 && batch->ring == BLT_RING) {
         flags = I915_EXEC_BLT;
      } else {
         flags = I915_EXEC_RENDER;
      }
      if (batch->needs_sol_reset)
         flags |= I915_EXEC_GEN7_SOL_RESET;

      if (ret == 0) {
         if (unlikely(INTEL_DEBUG & DEBUG_AUB))
            brw_annotate_aub(brw);

         /* The hardware context is only used for render-ring batches. */
         if (brw->hw_ctx == NULL || batch->ring != RENDER_RING) {
            ret = drm_intel_bo_mrb_exec(batch->bo, 4 * batch->used, NULL, 0, 0,
                                        flags);
         } else {
            ret = drm_intel_gem_bo_context_exec(batch->bo, brw->hw_ctx,
                                                4 * batch->used, flags);
         }
      }

      throttle(brw);
   }

   if (unlikely(INTEL_DEBUG & DEBUG_BATCH))
      do_batch_dump(brw);

   if (ret != 0) {
      fprintf(stderr, "intel_do_flush_locked failed: %s\n", strerror(-ret));
      exit(1);
   }

   return ret;
}

/**
 * Finishes the current batch, submits it and starts a new one.
 *
 * \param file  source file of the caller, printed with DEBUG_BATCH
 * \param line  source line of the caller, printed with DEBUG_BATCH
 * \return 0 if the batch was empty (nothing is submitted), otherwise the
 *         result of do_flush_locked().
 */
int
_intel_batchbuffer_flush(struct brw_context *brw,
                         const char *file, int line)
{
   int ret;

   if (brw->batch.used == 0)
      return 0;

   /* Remember the first batch emitted since the last swap for throttle(). */
   if (brw->throttle_batch[0] == NULL) {
      brw->throttle_batch[0] = brw->batch.bo;
      drm_intel_bo_reference(brw->throttle_batch[0]);
   }

   if (unlikely(INTEL_DEBUG & DEBUG_BATCH)) {
      int bytes_for_commands = 4 * brw->batch.used;
      int bytes_for_state = brw->batch.bo->size - brw->batch.state_batch_offset;
      int total_bytes = bytes_for_commands + bytes_for_state;
      fprintf(stderr, "%s:%d: Batchbuffer flush with %4db (pkt) + "
              "%4db (state) = %4db (%0.1f%%)\n", file, line,
              bytes_for_commands, bytes_for_state,
              total_bytes,
              100.0f * total_bytes / BATCH_SZ);
   }

   /* Release the reserved tail so brw_finish_batch can emit into it. */
   brw->batch.reserved_space = 0;

   brw_finish_batch(brw);

   /* Mark the end of the buffer. */
   intel_batchbuffer_emit_dword(brw, MI_BATCH_BUFFER_END);
   if (brw->batch.used & 1) {
      /* Round batchbuffer usage to 2 DWORDs. */
      intel_batchbuffer_emit_dword(brw, MI_NOOP);
   }

   intel_upload_finish(brw);

   /* Check that we didn't just wrap our batchbuffer at a bad time. */
   assert(!brw->no_batch_wrap);

   ret = do_flush_locked(brw);

   if (unlikely(INTEL_DEBUG & DEBUG_SYNC)) {
      fprintf(stderr, "waiting for idle\n");
      drm_intel_bo_wait_rendering(brw->batch.bo);
   }

   /* Start a new batch buffer. */
   brw_new_batch(brw);

   return ret;
}

/**
 * Emits a relocation entry for \p buffer at the current batch position,
 * followed by one dword holding the presumed address.
 *
 * This is the only way buffers get added to the validate list.
 *
 * \return always true; a failing relocation is only caught by assert().
 */
bool
intel_batchbuffer_emit_reloc(struct brw_context *brw,
                             drm_intel_bo *buffer,
                             uint32_t read_domains, uint32_t write_domain,
                             uint32_t delta)
{
   int ret;

   ret = drm_intel_bo_emit_reloc(brw->batch.bo, 4*brw->batch.used,
                                 buffer, delta,
                                 read_domains, write_domain);
   assert(ret == 0);
   (void)ret;

   /* Using the old buffer offset, write in what the right data would be, in
    * case the buffer doesn't move and we can short-circuit the relocation
    * processing in the kernel
    */
   intel_batchbuffer_emit_dword(brw, buffer->offset64 + delta);

   return true;
}

/**
 * Same as intel_batchbuffer_emit_reloc(), but writes the presumed address
 * as two dwords: low 32 bits first, then the high 32 bits.
 *
 * \return always true; a failing relocation is only caught by assert().
 */
bool
intel_batchbuffer_emit_reloc64(struct brw_context *brw,
                               drm_intel_bo *buffer,
                               uint32_t read_domains, uint32_t write_domain,
                               uint32_t delta)
{
   int ret = drm_intel_bo_emit_reloc(brw->batch.bo, 4*brw->batch.used,
                                     buffer, delta,
                                     read_domains, write_domain);
   assert(ret == 0);
   (void) ret;

   /* Using the old buffer offset, write in what the right data would be, in
    * case the buffer doesn't move and we can short-circuit the relocation
    * processing in the kernel
    */
   uint64_t offset = buffer->offset64 + delta;
   intel_batchbuffer_emit_dword(brw, offset);
   intel_batchbuffer_emit_dword(brw, offset >> 32);

   return true;
}

/**
 * Copies \p bytes bytes of pre-built commands into the batch for \p ring.
 *
 * \p bytes must be a multiple of 4; space is reserved through
 * intel_batchbuffer_require_space() before the copy.
 */
void
intel_batchbuffer_data(struct brw_context *brw,
                       const void *data, GLuint bytes, enum brw_gpu_ring ring)
{
   assert((bytes & 3) == 0);
   intel_batchbuffer_require_space(brw, bytes, ring);
   memcpy(brw->batch.map + brw->batch.used, data, bytes);
   /* The used counter is kept in dwords. */
   brw->batch.used += bytes >> 2;
}

/**
 * Emits \p size MI_LOAD_REGISTER_MEM commands, one per 32-bit register.
 *
 * Command i loads register (reg + 4 * i) from \p bo at (offset + 4 * i).
 * Gen8+ uses the 4-dword form with a 64-bit relocation, Gen7 the 3-dword
 * form with a 32-bit relocation.
 */
static void
load_sized_register_mem(struct brw_context *brw,
                        uint32_t reg,
                        drm_intel_bo *bo,
                        uint32_t read_domains, uint32_t write_domain,
                        uint32_t offset,
                        int size)
{
   int i;

   /* MI_LOAD_REGISTER_MEM only exists on Gen7+. */
   assert(brw->gen >= 7);

   if (brw->gen >= 8) {
      BEGIN_BATCH(4 * size);
      for (i = 0; i < size; i++) {
         OUT_BATCH(GEN7_MI_LOAD_REGISTER_MEM | (4 - 2));
         OUT_BATCH(reg + i * 4);
         OUT_RELOC64(bo, read_domains, write_domain, offset + i * 4);
      }
      ADVANCE_BATCH();
   } else {
      BEGIN_BATCH(3 * size);
      for (i = 0; i < size; i++) {
         OUT_BATCH(GEN7_MI_LOAD_REGISTER_MEM | (3 - 2));
         OUT_BATCH(reg + i * 4);
         OUT_RELOC(bo, read_domains, write_domain, offset + i * 4);
      }
      ADVANCE_BATCH();
   }
}

/**
 * Loads one 32-bit register \p reg from \p bo at \p offset.
 */
void
brw_load_register_mem(struct brw_context *brw,
                      uint32_t reg,
                      drm_intel_bo *bo,
                      uint32_t read_domains, uint32_t write_domain,
                      uint32_t offset)
{
   load_sized_register_mem(brw, reg, bo, read_domains, write_domain, offset, 1);
}

/**
 * Loads two consecutive 32-bit registers starting at \p reg from \p bo at
 * \p offset.
 */
void
brw_load_register_mem64(struct brw_context *brw,
                        uint32_t reg,
                        drm_intel_bo *bo,
                        uint32_t read_domains, uint32_t write_domain,
                        uint32_t offset)
{
   load_sized_register_mem(brw, reg, bo, read_domains, write_domain, offset, 2);
}