summaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorJerome Glisse <[email protected]>2013-01-07 17:45:59 -0500
committerJerome Glisse <[email protected]>2013-01-28 11:30:35 -0500
commit325422c49449acdd8df1eb2ca8ed81f7696c38cc (patch)
tree7cbc1aafe166dd925c5626c7118f523c8b75ae6d
parentbff07638a86d36ac826fb287214eda9ce31c02ad (diff)
r600g: add async for staging buffer upload v2
v2: Add virtual address to dma src/dst offset for cayman Signed-off-by: Jerome Glisse <[email protected]>
-rw-r--r--src/gallium/drivers/r600/evergreen_hw_context.c46
-rw-r--r--src/gallium/drivers/r600/evergreen_state.c201
-rw-r--r--src/gallium/drivers/r600/evergreend.h15
-rw-r--r--src/gallium/drivers/r600/r600.h27
-rw-r--r--src/gallium/drivers/r600/r600_buffer.c25
-rw-r--r--src/gallium/drivers/r600/r600_hw_context.c48
-rw-r--r--src/gallium/drivers/r600/r600_pipe.c6
-rw-r--r--src/gallium/drivers/r600/r600_pipe.h9
-rw-r--r--src/gallium/drivers/r600/r600_state.c190
-rw-r--r--src/gallium/drivers/r600/r600_state_common.c6
-rw-r--r--src/gallium/drivers/r600/r600_texture.c24
-rw-r--r--src/gallium/drivers/r600/r600d.h15
12 files changed, 595 insertions, 17 deletions
diff --git a/src/gallium/drivers/r600/evergreen_hw_context.c b/src/gallium/drivers/r600/evergreen_hw_context.c
index fa90c9a9b74..ca4f4b3bdf4 100644
--- a/src/gallium/drivers/r600/evergreen_hw_context.c
+++ b/src/gallium/drivers/r600/evergreen_hw_context.c
@@ -26,6 +26,7 @@
#include "r600_hw_context_priv.h"
#include "evergreend.h"
#include "util/u_memory.h"
+#include "util/u_math.h"
static const struct r600_reg cayman_config_reg_list[] = {
{R_009100_SPI_CONFIG_CNTL, REG_FLAG_ENABLE_ALWAYS | REG_FLAG_FLUSH_CHANGE, 0},
@@ -238,3 +239,48 @@ void evergreen_set_streamout_enable(struct r600_context *ctx, unsigned buffer_en
r600_write_context_reg(cs, R_028B94_VGT_STRMOUT_CONFIG, S_028B94_STREAMOUT_0_EN(0));
}
}
+
+void evergreen_dma_copy(struct r600_context *rctx,
+ struct pipe_resource *dst,
+ struct pipe_resource *src,
+ unsigned long dst_offset,
+ unsigned long src_offset,
+ unsigned long size)
+{
+ struct radeon_winsys_cs *cs = rctx->rings.dma.cs;
+ unsigned i, ncopy, csize, sub_cmd, shift;
+ struct r600_resource *rdst = (struct r600_resource*)dst;
+ struct r600_resource *rsrc = (struct r600_resource*)src;
+
+ /* make sure that the dma ring is only one active */
+ rctx->rings.gfx.flush(rctx, RADEON_FLUSH_ASYNC);
+ dst_offset += r600_resource_va(&rctx->screen->screen, dst);
+ src_offset += r600_resource_va(&rctx->screen->screen, src);
+
+ /* see if we use dword or byte copy */
+ if (!(dst_offset & 0x3) && !(src_offset & 0x3) && !(size & 0x3)) {
+ size >>= 2;
+ sub_cmd = 0x00;
+ shift = 2;
+ } else {
+ sub_cmd = 0x40;
+ shift = 0;
+ }
+ ncopy = (size / 0x000fffff) + !!(size % 0x000fffff);
+
+ r600_need_dma_space(rctx, ncopy * 5);
+ for (i = 0; i < ncopy; i++) {
+ csize = size < 0x000fffff ? size : 0x000fffff;
+ /* emit reloc before writting cs so that cs is always in consistent state */
+ r600_context_bo_reloc(rctx, &rctx->rings.dma, rsrc, RADEON_USAGE_READ);
+ r600_context_bo_reloc(rctx, &rctx->rings.dma, rdst, RADEON_USAGE_WRITE);
+ cs->buf[cs->cdw++] = DMA_PACKET(DMA_PACKET_COPY, sub_cmd, csize);
+ cs->buf[cs->cdw++] = dst_offset & 0xffffffff;
+ cs->buf[cs->cdw++] = src_offset & 0xffffffff;
+ cs->buf[cs->cdw++] = (dst_offset >> 32UL) & 0xff;
+ cs->buf[cs->cdw++] = (src_offset >> 32UL) & 0xff;
+ dst_offset += csize << shift;
+ src_offset += csize << shift;
+ size -= csize;
+ }
+}
diff --git a/src/gallium/drivers/r600/evergreen_state.c b/src/gallium/drivers/r600/evergreen_state.c
index d03c376f0fc..be1c427ce29 100644
--- a/src/gallium/drivers/r600/evergreen_state.c
+++ b/src/gallium/drivers/r600/evergreen_state.c
@@ -30,6 +30,20 @@
#include "util/u_framebuffer.h"
#include "util/u_dual_blend.h"
#include "evergreen_compute.h"
+#include "util/u_math.h"
+
+static INLINE unsigned evergreen_array_mode(unsigned mode)
+{
+ switch (mode) {
+ case RADEON_SURF_MODE_LINEAR_ALIGNED: return V_028C70_ARRAY_LINEAR_ALIGNED;
+ break;
+ case RADEON_SURF_MODE_1D: return V_028C70_ARRAY_1D_TILED_THIN1;
+ break;
+ case RADEON_SURF_MODE_2D: return V_028C70_ARRAY_2D_TILED_THIN1;
+ default:
+ case RADEON_SURF_MODE_LINEAR: return V_028C70_ARRAY_LINEAR_GENERAL;
+ }
+}
static uint32_t eg_num_banks(uint32_t nbanks)
{
@@ -3445,3 +3459,190 @@ void evergreen_update_db_shader_control(struct r600_context * rctx)
rctx->db_misc_state.atom.dirty = true;
}
}
+
+static void evergreen_dma_copy_tile(struct r600_context *rctx,
+ struct pipe_resource *dst,
+ unsigned dst_level,
+ unsigned dst_x,
+ unsigned dst_y,
+ unsigned dst_z,
+ struct pipe_resource *src,
+ unsigned src_level,
+ unsigned src_x,
+ unsigned src_y,
+ unsigned src_z,
+ unsigned copy_height,
+ unsigned pitch,
+ unsigned bpp)
+{
+ struct radeon_winsys_cs *cs = rctx->rings.dma.cs;
+ struct r600_texture *rsrc = (struct r600_texture*)src;
+ struct r600_texture *rdst = (struct r600_texture*)dst;
+ unsigned array_mode, lbpp, pitch_tile_max, slice_tile_max, size;
+ unsigned ncopy, height, cheight, detile, i, x, y, z, src_mode, dst_mode;
+ unsigned sub_cmd, bank_h, bank_w, mt_aspect, nbanks, tile_split;
+ unsigned long base, addr;
+
+ /* make sure that the dma ring is only one active */
+ rctx->rings.gfx.flush(rctx, RADEON_FLUSH_ASYNC);
+
+ dst_mode = rdst->surface.level[dst_level].mode;
+ src_mode = rsrc->surface.level[src_level].mode;
+ /* downcast linear aligned to linear to simplify test */
+ src_mode = src_mode == RADEON_SURF_MODE_LINEAR_ALIGNED ? RADEON_SURF_MODE_LINEAR : src_mode;
+ dst_mode = dst_mode == RADEON_SURF_MODE_LINEAR_ALIGNED ? RADEON_SURF_MODE_LINEAR : dst_mode;
+ assert(dst_mode != src_mode);
+
+ y = 0;
+ sub_cmd = 0x8;
+ lbpp = util_logbase2(bpp);
+ pitch_tile_max = ((pitch / bpp) >> 3) - 1;
+ nbanks = eg_num_banks(rctx->screen->tiling_info.num_banks);
+
+ if (dst_mode == RADEON_SURF_MODE_LINEAR) {
+ /* T2L */
+ array_mode = evergreen_array_mode(src_mode);
+ slice_tile_max = (((pitch * rsrc->surface.level[src_level].npix_y) >> 6) / bpp) - 1;
+ /* linear height must be the same as the slice tile max height, it's ok even
+ * if the linear destination/source have smaller heigh as the size of the
+ * dma packet will be using the copy_height which is always smaller or equal
+ * to the linear height
+ */
+ height = rsrc->surface.level[src_level].npix_y;
+ detile = 1;
+ x = src_x;
+ y = src_y;
+ z = src_z;
+ base = rsrc->surface.level[src_level].offset;
+ addr = rdst->surface.level[dst_level].offset;
+ addr += rdst->surface.level[dst_level].slice_size * dst_z;
+ addr += dst_y * pitch + dst_x * bpp;
+ bank_h = eg_bank_wh(rsrc->surface.bankh);
+ bank_w = eg_bank_wh(rsrc->surface.bankw);
+ mt_aspect = eg_macro_tile_aspect(rsrc->surface.mtilea);
+ tile_split = eg_tile_split(rsrc->surface.tile_split);
+ base += r600_resource_va(&rctx->screen->screen, src);
+ addr += r600_resource_va(&rctx->screen->screen, dst);
+ } else {
+ /* L2T */
+ array_mode = evergreen_array_mode(dst_mode);
+ slice_tile_max = (((pitch * rdst->surface.level[dst_level].npix_y) >> 6) / bpp) - 1;
+ /* linear height must be the same as the slice tile max height, it's ok even
+ * if the linear destination/source have smaller heigh as the size of the
+ * dma packet will be using the copy_height which is always smaller or equal
+ * to the linear height
+ */
+ height = rdst->surface.level[dst_level].npix_y;
+ detile = 0;
+ x = dst_x;
+ y = dst_y;
+ z = dst_z;
+ base = rdst->surface.level[dst_level].offset;
+ addr = rsrc->surface.level[src_level].offset;
+ addr += rsrc->surface.level[src_level].slice_size * src_z;
+ addr += src_y * pitch + src_x * bpp;
+ bank_h = eg_bank_wh(rdst->surface.bankh);
+ bank_w = eg_bank_wh(rdst->surface.bankw);
+ mt_aspect = eg_macro_tile_aspect(rdst->surface.mtilea);
+ tile_split = eg_tile_split(rdst->surface.tile_split);
+ base += r600_resource_va(&rctx->screen->screen, dst);
+ addr += r600_resource_va(&rctx->screen->screen, src);
+ }
+
+ size = (copy_height * pitch) >> 2;
+ ncopy = (size / 0x000fffff) + !!(size % 0x000fffff);
+ r600_need_dma_space(rctx, ncopy * 9);
+
+ for (i = 0; i < ncopy; i++) {
+ cheight = copy_height;
+ if (((cheight * pitch) >> 2) > 0x000fffff) {
+ cheight = (0x000fffff << 2) / pitch;
+ }
+ size = (cheight * pitch) >> 2;
+ /* emit reloc before writting cs so that cs is always in consistent state */
+ r600_context_bo_reloc(rctx, &rctx->rings.dma, &rsrc->resource, RADEON_USAGE_READ);
+ r600_context_bo_reloc(rctx, &rctx->rings.dma, &rdst->resource, RADEON_USAGE_WRITE);
+ cs->buf[cs->cdw++] = DMA_PACKET(DMA_PACKET_COPY, sub_cmd, size);
+ cs->buf[cs->cdw++] = base >> 8;
+ cs->buf[cs->cdw++] = (detile << 31) | (array_mode << 27) |
+ (lbpp << 24) | (bank_h << 21) |
+ (bank_w << 18) | (mt_aspect << 16);
+ cs->buf[cs->cdw++] = (pitch_tile_max << 0) | ((height - 1) << 16);
+ cs->buf[cs->cdw++] = (slice_tile_max << 0);
+ cs->buf[cs->cdw++] = (x << 0) | (z << 18);
+ cs->buf[cs->cdw++] = (y << 0) | (tile_split << 21) | (nbanks << 25);
+ cs->buf[cs->cdw++] = addr & 0xfffffffc;
+ cs->buf[cs->cdw++] = (addr >> 32UL) & 0xff;
+ copy_height -= cheight;
+ addr += cheight * pitch;
+ y += cheight;
+ }
+}
+
+boolean evergreen_dma_blit(struct pipe_context *ctx,
+ struct pipe_resource *dst,
+ unsigned dst_level,
+ unsigned dst_x, unsigned dst_y, unsigned dst_z,
+ struct pipe_resource *src,
+ unsigned src_level,
+ const struct pipe_box *src_box)
+{
+ struct r600_context *rctx = (struct r600_context *)ctx;
+ struct r600_texture *rsrc = (struct r600_texture*)src;
+ struct r600_texture *rdst = (struct r600_texture*)dst;
+ unsigned dst_pitch, src_pitch, bpp, dst_mode, src_mode, copy_height;
+ unsigned src_w, dst_w;
+
+ if (rctx->rings.dma.cs == NULL) {
+ return FALSE;
+ }
+ if (src->format != dst->format) {
+ return FALSE;
+ }
+
+ bpp = rdst->surface.bpe;
+ dst_pitch = rdst->surface.level[dst_level].pitch_bytes;
+ src_pitch = rsrc->surface.level[src_level].pitch_bytes;
+ src_w = rsrc->surface.level[src_level].npix_x;
+ dst_w = rdst->surface.level[dst_level].npix_x;
+ copy_height = src_box->height / rsrc->surface.blk_h;
+
+ dst_mode = rdst->surface.level[dst_level].mode;
+ src_mode = rsrc->surface.level[src_level].mode;
+ /* downcast linear aligned to linear to simplify test */
+ src_mode = src_mode == RADEON_SURF_MODE_LINEAR_ALIGNED ? RADEON_SURF_MODE_LINEAR : src_mode;
+ dst_mode = dst_mode == RADEON_SURF_MODE_LINEAR_ALIGNED ? RADEON_SURF_MODE_LINEAR : dst_mode;
+
+ if (src_pitch != dst_pitch || src_box->x || dst_x || src_w != dst_w) {
+ /* FIXME evergreen can do partial blit */
+ return FALSE;
+ }
+ /* the x test here are currently useless (because we don't support partial blit)
+ * but keep them around so we don't forget about those
+ */
+ if ((src_pitch & 0x7) || (src_box->x & 0x7) || (dst_x & 0x7) || (src_box->y & 0x7) || (dst_y & 0x7)) {
+ return FALSE;
+ }
+
+ if (src_mode == dst_mode) {
+ unsigned long dst_offset, src_offset;
+ /* simple dma blit would do NOTE code here assume :
+ * src_box.x/y == 0
+ * dst_x/y == 0
+ * dst_pitch == src_pitch
+ */
+ src_offset= rsrc->surface.level[src_level].offset;
+ src_offset += rsrc->surface.level[src_level].slice_size * src_box->z;
+ src_offset += src_box->y * src_pitch + src_box->x * bpp;
+ dst_offset = rdst->surface.level[dst_level].offset;
+ dst_offset += rdst->surface.level[dst_level].slice_size * dst_z;
+ dst_offset += dst_y * dst_pitch + dst_x * bpp;
+ evergreen_dma_copy(rctx, dst, src, dst_offset, src_offset,
+ src_box->height * src_pitch);
+ } else {
+ evergreen_dma_copy_tile(rctx, dst, dst_level, dst_x, dst_y, dst_z,
+ src, src_level, src_box->x, src_box->y, src_box->z,
+ copy_height, dst_pitch, bpp);
+ }
+ return TRUE;
+}
diff --git a/src/gallium/drivers/r600/evergreend.h b/src/gallium/drivers/r600/evergreend.h
index d9dba959bd5..12c7ed14095 100644
--- a/src/gallium/drivers/r600/evergreend.h
+++ b/src/gallium/drivers/r600/evergreend.h
@@ -2317,4 +2317,19 @@
#define G_028AA8_SWITCH_ON_EOP(x) (((x) >> 17) & 0x1)
#define C_028AA8_SWITCH_ON_EOP 0xFFFDFFFF
+/* async DMA packets */
+#define DMA_PACKET(cmd, sub_cmd, n) ((((cmd) & 0xF) << 28) | \
+ (((sub_cmd) & 0xFF) << 20) |\
+ (((n) & 0xFFFFF) << 0))
+/* async DMA Packet types */
+#define DMA_PACKET_WRITE 0x2
+#define DMA_PACKET_COPY 0x3
+#define DMA_PACKET_INDIRECT_BUFFER 0x4
+#define DMA_PACKET_SEMAPHORE 0x5
+#define DMA_PACKET_FENCE 0x6
+#define DMA_PACKET_TRAP 0x7
+#define DMA_PACKET_SRBM_WRITE 0x9
+#define DMA_PACKET_CONSTANT_FILL 0xd
+#define DMA_PACKET_NOP 0xf
+
#endif
diff --git a/src/gallium/drivers/r600/r600.h b/src/gallium/drivers/r600/r600.h
index 2734594987e..a383c908f64 100644
--- a/src/gallium/drivers/r600/r600.h
+++ b/src/gallium/drivers/r600/r600.h
@@ -170,6 +170,33 @@ void r600_flush_emit(struct r600_context *ctx);
void r600_context_streamout_begin(struct r600_context *ctx);
void r600_context_streamout_end(struct r600_context *ctx);
void r600_need_cs_space(struct r600_context *ctx, unsigned num_dw, boolean count_draw_in);
+void r600_need_dma_space(struct r600_context *ctx, unsigned num_dw);
+void r600_dma_copy(struct r600_context *rctx,
+ struct pipe_resource *dst,
+ struct pipe_resource *src,
+ unsigned long dst_offset,
+ unsigned long src_offset,
+ unsigned long size);
+boolean r600_dma_blit(struct pipe_context *ctx,
+ struct pipe_resource *dst,
+ unsigned dst_level,
+ unsigned dst_x, unsigned dst_y, unsigned dst_z,
+ struct pipe_resource *src,
+ unsigned src_level,
+ const struct pipe_box *src_box);
+void evergreen_dma_copy(struct r600_context *rctx,
+ struct pipe_resource *dst,
+ struct pipe_resource *src,
+ unsigned long dst_offset,
+ unsigned long src_offset,
+ unsigned long size);
+boolean evergreen_dma_blit(struct pipe_context *ctx,
+ struct pipe_resource *dst,
+ unsigned dst_level,
+ unsigned dst_x, unsigned dst_y, unsigned dst_z,
+ struct pipe_resource *src,
+ unsigned src_level,
+ const struct pipe_box *src_box);
void r600_context_block_emit_dirty(struct r600_context *ctx, struct r600_block *block, unsigned pkt_flags);
void r600_cp_dma_copy_buffer(struct r600_context *rctx,
struct pipe_resource *dst, uint64_t dst_offset,
diff --git a/src/gallium/drivers/r600/r600_buffer.c b/src/gallium/drivers/r600/r600_buffer.c
index be171f8850a..6df0d91a56c 100644
--- a/src/gallium/drivers/r600/r600_buffer.c
+++ b/src/gallium/drivers/r600/r600_buffer.c
@@ -27,6 +27,7 @@
#include "r600_pipe.h"
#include "util/u_upload_mgr.h"
#include "util/u_memory.h"
+#include "util/u_surface.h"
static void r600_buffer_destroy(struct pipe_screen *screen,
struct pipe_resource *buf)
@@ -179,13 +180,27 @@ static void r600_buffer_transfer_unmap(struct pipe_context *pipe,
struct r600_transfer *rtransfer = (struct r600_transfer*)transfer;
if (rtransfer->staging) {
- struct pipe_box box;
- u_box_1d(rtransfer->offset + transfer->box.x % R600_MAP_BUFFER_ALIGNMENT,
- transfer->box.width, &box);
+ struct pipe_resource *dst, *src;
+ unsigned soffset, doffset, size;
+ dst = transfer->resource;
+ src = &rtransfer->staging->b.b;
+ size = transfer->box.width;
+ doffset = transfer->box.x;
+ soffset = rtransfer->offset + transfer->box.x % R600_MAP_BUFFER_ALIGNMENT;
/* Copy the staging buffer into the original one. */
- r600_copy_buffer(pipe, transfer->resource, transfer->box.x,
- &rtransfer->staging->b.b, &box);
+ if (rctx->rings.dma.cs && !(size % 4) && !(doffset % 4) && !(soffset)) {
+ if (rctx->screen->chip_class >= EVERGREEN) {
+ evergreen_dma_copy(rctx, dst, src, doffset, soffset, size);
+ } else {
+ r600_dma_copy(rctx, dst, src, doffset, soffset, size);
+ }
+ } else {
+ struct pipe_box box;
+
+ u_box_1d(soffset, size, &box);
+ r600_copy_buffer(pipe, dst, doffset, src, &box);
+ }
pipe_resource_reference((struct pipe_resource**)&rtransfer->staging, NULL);
}
util_slab_free(&rctx->pool_transfers, transfer);
diff --git a/src/gallium/drivers/r600/r600_hw_context.c b/src/gallium/drivers/r600/r600_hw_context.c
index b28827f2b29..61299ef40fb 100644
--- a/src/gallium/drivers/r600/r600_hw_context.c
+++ b/src/gallium/drivers/r600/r600_hw_context.c
@@ -762,8 +762,6 @@ void r600_context_flush(struct r600_context *ctx, unsigned flags)
}
}
#endif
-
- r600_begin_new_cs(ctx);
}
void r600_begin_new_cs(struct r600_context *ctx)
@@ -1129,3 +1127,49 @@ void r600_cp_dma_copy_buffer(struct r600_context *rctx,
dst_offset += byte_count;
}
}
+
+void r600_need_dma_space(struct r600_context *ctx, unsigned num_dw)
+{
+ /* The number of dwords we already used in the DMA so far. */
+ num_dw += ctx->rings.dma.cs->cdw;
+ /* Flush if there's not enough space. */
+ if (num_dw > RADEON_MAX_CMDBUF_DWORDS) {
+ ctx->rings.dma.flush(ctx, RADEON_FLUSH_ASYNC);
+ }
+}
+
+void r600_dma_copy(struct r600_context *rctx,
+ struct pipe_resource *dst,
+ struct pipe_resource *src,
+ unsigned long dst_offset,
+ unsigned long src_offset,
+ unsigned long size)
+{
+ struct radeon_winsys_cs *cs = rctx->rings.dma.cs;
+ unsigned i, ncopy, csize, shift;
+ struct r600_resource *rdst = (struct r600_resource*)dst;
+ struct r600_resource *rsrc = (struct r600_resource*)src;
+
+ /* make sure that the dma ring is only one active */
+ rctx->rings.gfx.flush(rctx, RADEON_FLUSH_ASYNC);
+
+ size >>= 2;
+ shift = 2;
+ ncopy = (size / 0xffff) + !!(size % 0xffff);
+
+ r600_need_dma_space(rctx, ncopy * 5);
+ for (i = 0; i < ncopy; i++) {
+ csize = size < 0xffff ? size : 0xffff;
+ /* emit reloc before writting cs so that cs is always in consistent state */
+ r600_context_bo_reloc(rctx, &rctx->rings.dma, rsrc, RADEON_USAGE_READ);
+ r600_context_bo_reloc(rctx, &rctx->rings.dma, rdst, RADEON_USAGE_WRITE);
+ cs->buf[cs->cdw++] = DMA_PACKET(DMA_PACKET_COPY, 0, 0, csize);
+ cs->buf[cs->cdw++] = dst_offset & 0xfffffffc;
+ cs->buf[cs->cdw++] = src_offset & 0xfffffffc;
+ cs->buf[cs->cdw++] = (dst_offset >> 32UL) & 0xff;
+ cs->buf[cs->cdw++] = (src_offset >> 32UL) & 0xff;
+ dst_offset += csize << shift;
+ src_offset += csize << shift;
+ size -= csize;
+ }
+}
diff --git a/src/gallium/drivers/r600/r600_pipe.c b/src/gallium/drivers/r600/r600_pipe.c
index c72ee8f8571..676741255f5 100644
--- a/src/gallium/drivers/r600/r600_pipe.c
+++ b/src/gallium/drivers/r600/r600_pipe.c
@@ -30,6 +30,7 @@
#include "util/u_memory.h"
#include "util/u_simple_shaders.h"
#include "util/u_upload_mgr.h"
+#include "util/u_math.h"
#include "vl/vl_decoder.h"
#include "vl/vl_video_buffer.h"
#include "os/os_time.h"
@@ -128,12 +129,13 @@ static void r600_flush(struct pipe_context *ctx, unsigned flags)
}
r600_context_flush(rctx, flags);
+ rctx->rings.gfx.flushing = false;
+ r600_begin_new_cs(rctx);
/* Re-enable render condition. */
if (render_cond) {
ctx->render_condition(ctx, render_cond, render_cond_mode);
}
- rctx->rings.gfx.flushing = false;
}
static void r600_flush_from_st(struct pipe_context *ctx,
@@ -1111,8 +1113,10 @@ struct pipe_screen *r600_screen_create(struct radeon_winsys *ws)
if (rscreen->chip_class >= EVERGREEN) {
rscreen->screen.is_format_supported = evergreen_is_format_supported;
+ rscreen->dma_blit = &evergreen_dma_blit;
} else {
rscreen->screen.is_format_supported = r600_is_format_supported;
+ rscreen->dma_blit = &r600_dma_blit;
}
rscreen->screen.is_video_format_supported = vl_video_buffer_is_format_supported;
rscreen->screen.context_create = r600_create_context;
diff --git a/src/gallium/drivers/r600/r600_pipe.h b/src/gallium/drivers/r600/r600_pipe.h
index 5cb080579f0..31dcd057265 100644
--- a/src/gallium/drivers/r600/r600_pipe.h
+++ b/src/gallium/drivers/r600/r600_pipe.h
@@ -220,6 +220,14 @@ enum r600_msaa_texture_mode {
MSAA_TEXTURE_COMPRESSED
};
+typedef boolean (*r600g_dma_blit_t)(struct pipe_context *ctx,
+ struct pipe_resource *dst,
+ unsigned dst_level,
+ unsigned dst_x, unsigned dst_y, unsigned dst_z,
+ struct pipe_resource *src,
+ unsigned src_level,
+ const struct pipe_box *src_box);
+
struct r600_screen {
struct pipe_screen screen;
struct radeon_winsys *ws;
@@ -243,6 +251,7 @@ struct r600_screen {
uint32_t *trace_ptr;
unsigned cs_count;
#endif
+ r600g_dma_blit_t dma_blit;
};
struct r600_pipe_sampler_view {
diff --git a/src/gallium/drivers/r600/r600_state.c b/src/gallium/drivers/r600/r600_state.c
index da249c247bc..6b4b2c3819a 100644
--- a/src/gallium/drivers/r600/r600_state.c
+++ b/src/gallium/drivers/r600/r600_state.c
@@ -2945,3 +2945,193 @@ void r600_update_db_shader_control(struct r600_context * rctx)
rctx->db_misc_state.atom.dirty = true;
}
}
+
+static INLINE unsigned r600_array_mode(unsigned mode)
+{
+ switch (mode) {
+ case RADEON_SURF_MODE_LINEAR_ALIGNED: return V_0280A0_ARRAY_LINEAR_ALIGNED;
+ break;
+ case RADEON_SURF_MODE_1D: return V_0280A0_ARRAY_1D_TILED_THIN1;
+ break;
+ case RADEON_SURF_MODE_2D: return V_0280A0_ARRAY_2D_TILED_THIN1;
+ default:
+ case RADEON_SURF_MODE_LINEAR: return V_0280A0_ARRAY_LINEAR_GENERAL;
+ }
+}
+
+static boolean r600_dma_copy_tile(struct r600_context *rctx,
+ struct pipe_resource *dst,
+ unsigned dst_level,
+ unsigned dst_x,
+ unsigned dst_y,
+ unsigned dst_z,
+ struct pipe_resource *src,
+ unsigned src_level,
+ unsigned src_x,
+ unsigned src_y,
+ unsigned src_z,
+ unsigned copy_height,
+ unsigned pitch,
+ unsigned bpp)
+{
+ struct radeon_winsys_cs *cs = rctx->rings.dma.cs;
+ struct r600_texture *rsrc = (struct r600_texture*)src;
+ struct r600_texture *rdst = (struct r600_texture*)dst;
+ unsigned array_mode, lbpp, pitch_tile_max, slice_tile_max, size;
+ unsigned ncopy, height, cheight, detile, i, x, y, z, src_mode, dst_mode;
+ unsigned long base, addr;
+
+ /* make sure that the dma ring is only one active */
+ rctx->rings.gfx.flush(rctx, RADEON_FLUSH_ASYNC);
+
+ dst_mode = rdst->surface.level[dst_level].mode;
+ src_mode = rsrc->surface.level[src_level].mode;
+ /* downcast linear aligned to linear to simplify test */
+ src_mode = src_mode == RADEON_SURF_MODE_LINEAR_ALIGNED ? RADEON_SURF_MODE_LINEAR : src_mode;
+ dst_mode = dst_mode == RADEON_SURF_MODE_LINEAR_ALIGNED ? RADEON_SURF_MODE_LINEAR : dst_mode;
+ assert(dst_mode != src_mode);
+
+ y = 0;
+ lbpp = util_logbase2(bpp);
+ pitch_tile_max = ((pitch / bpp) >> 3) - 1;
+
+ if (dst_mode == RADEON_SURF_MODE_LINEAR) {
+ /* T2L */
+ array_mode = r600_array_mode(src_mode);
+ slice_tile_max = (((pitch * rsrc->surface.level[src_level].npix_y) >> 6) / bpp) - 1;
+ /* linear height must be the same as the slice tile max height, it's ok even
+ * if the linear destination/source have smaller heigh as the size of the
+ * dma packet will be using the copy_height which is always smaller or equal
+ * to the linear height
+ */
+ height = rsrc->surface.level[src_level].npix_y;
+ detile = 1;
+ x = src_x;
+ y = src_y;
+ z = src_z;
+ base = rsrc->surface.level[src_level].offset;
+ addr = rdst->surface.level[dst_level].offset;
+ addr += rdst->surface.level[dst_level].slice_size * dst_z;
+ addr += dst_y * pitch + dst_x * bpp;
+ } else {
+ /* L2T */
+ array_mode = r600_array_mode(dst_mode);
+ slice_tile_max = (((pitch * rdst->surface.level[dst_level].npix_y) >> 6) / bpp) - 1;
+ /* linear height must be the same as the slice tile max height, it's ok even
+ * if the linear destination/source have smaller heigh as the size of the
+ * dma packet will be using the copy_height which is always smaller or equal
+ * to the linear height
+ */
+ height = rdst->surface.level[dst_level].npix_y;
+ detile = 0;
+ x = dst_x;
+ y = dst_y;
+ z = dst_z;
+ base = rdst->surface.level[dst_level].offset;
+ addr = rsrc->surface.level[src_level].offset;
+ addr += rsrc->surface.level[src_level].slice_size * src_z;
+ addr += src_y * pitch + src_x * bpp;
+ }
+ /* check that we are in dw/base alignment constraint */
+ if ((addr & 0x3) || (base & 0xff)) {
+ return FALSE;
+ }
+
+ size = (copy_height * pitch) >> 2;
+ ncopy = (size / 0x0000ffff) + !!(size % 0x0000ffff);
+ r600_need_dma_space(rctx, ncopy * 7);
+ for (i = 0; i < ncopy; i++) {
+ cheight = copy_height;
+ if (((cheight * pitch) >> 2) > 0x0000ffff) {
+ cheight = (0x0000ffff << 2) / pitch;
+ }
+ size = (cheight * pitch) >> 2;
+ /* emit reloc before writting cs so that cs is always in consistent state */
+ r600_context_bo_reloc(rctx, &rctx->rings.dma, &rsrc->resource, RADEON_USAGE_READ);
+ r600_context_bo_reloc(rctx, &rctx->rings.dma, &rdst->resource, RADEON_USAGE_WRITE);
+ cs->buf[cs->cdw++] = DMA_PACKET(DMA_PACKET_COPY, 1, 0, size);
+ cs->buf[cs->cdw++] = base >> 8;
+ cs->buf[cs->cdw++] = (detile << 31) | (array_mode << 27) |
+ (lbpp << 24) | ((height - 1) << 10) |
+ pitch_tile_max;
+ cs->buf[cs->cdw++] = (slice_tile_max << 12) | (z << 0);
+ cs->buf[cs->cdw++] = (x << 3) | (y << 17);
+ cs->buf[cs->cdw++] = addr & 0xfffffffc;
+ cs->buf[cs->cdw++] = (addr >> 32UL) & 0xff;
+ copy_height -= cheight;
+ addr += cheight * pitch;
+ y += cheight;
+ }
+ return TRUE;
+}
+
+boolean r600_dma_blit(struct pipe_context *ctx,
+ struct pipe_resource *dst,
+ unsigned dst_level,
+ unsigned dst_x, unsigned dst_y, unsigned dst_z,
+ struct pipe_resource *src,
+ unsigned src_level,
+ const struct pipe_box *src_box)
+{
+ struct r600_context *rctx = (struct r600_context *)ctx;
+ struct r600_texture *rsrc = (struct r600_texture*)src;
+ struct r600_texture *rdst = (struct r600_texture*)dst;
+ unsigned dst_pitch, src_pitch, bpp, dst_mode, src_mode, copy_height;
+ unsigned src_w, dst_w;
+
+ if (rctx->rings.dma.cs == NULL) {
+ return FALSE;
+ }
+ if (src->format != dst->format) {
+ return FALSE;
+ }
+
+ bpp = rdst->surface.bpe;
+ dst_pitch = rdst->surface.level[dst_level].pitch_bytes;
+ src_pitch = rsrc->surface.level[src_level].pitch_bytes;
+ src_w = rsrc->surface.level[src_level].npix_x;
+ dst_w = rdst->surface.level[dst_level].npix_x;
+ copy_height = src_box->height / rsrc->surface.blk_h;
+
+ dst_mode = rdst->surface.level[dst_level].mode;
+ src_mode = rsrc->surface.level[src_level].mode;
+ /* downcast linear aligned to linear to simplify test */
+ src_mode = src_mode == RADEON_SURF_MODE_LINEAR_ALIGNED ? RADEON_SURF_MODE_LINEAR : src_mode;
+ dst_mode = dst_mode == RADEON_SURF_MODE_LINEAR_ALIGNED ? RADEON_SURF_MODE_LINEAR : dst_mode;
+
+ if (src_pitch != dst_pitch || src_box->x || dst_x || src_w != dst_w) {
+ /* strick requirement on r6xx/r7xx */
+ return FALSE;
+ }
+ /* lot of constraint on alignment this should capture them all */
+ if ((src_pitch & 0x7) || (src_box->y & 0x7) || (dst_y & 0x7)) {
+ return FALSE;
+ }
+
+ if (src_mode == dst_mode) {
+ unsigned long dst_offset, src_offset, size;
+
+ /* simple dma blit would do NOTE code here assume :
+ * src_box.x/y == 0
+ * dst_x/y == 0
+ * dst_pitch == src_pitch
+ */
+ src_offset= rsrc->surface.level[src_level].offset;
+ src_offset += rsrc->surface.level[src_level].slice_size * src_box->z;
+ src_offset += src_box->y * src_pitch + src_box->x * bpp;
+ dst_offset = rdst->surface.level[dst_level].offset;
+ dst_offset += rdst->surface.level[dst_level].slice_size * dst_z;
+ dst_offset += dst_y * dst_pitch + dst_x * bpp;
+ size = src_box->height * src_pitch;
+ /* must be dw aligned */
+ if ((dst_offset & 0x3) || (src_offset & 0x3) || (size & 0x3)) {
+ return FALSE;
+ }
+ r600_dma_copy(rctx, dst, src, dst_offset, src_offset, size);
+ } else {
+ return r600_dma_copy_tile(rctx, dst, dst_level, dst_x, dst_y, dst_z,
+ src, src_level, src_box->x, src_box->y, src_box->z,
+ copy_height, dst_pitch, bpp);
+ }
+ return TRUE;
+}
diff --git a/src/gallium/drivers/r600/r600_state_common.c b/src/gallium/drivers/r600/r600_state_common.c
index c7f672f748c..b547d647cba 100644
--- a/src/gallium/drivers/r600/r600_state_common.c
+++ b/src/gallium/drivers/r600/r600_state_common.c
@@ -1273,6 +1273,9 @@ static void r600_draw_vbo(struct pipe_context *ctx, const struct pipe_draw_info
return;
}
+ /* make sure that the gfx ring is only one active */
+ rctx->rings.dma.flush(rctx, RADEON_FLUSH_ASYNC);
+
if (!r600_update_derived_state(rctx)) {
/* useless to render because current rendering command
* can't be achieved
@@ -1280,9 +1283,6 @@ static void r600_draw_vbo(struct pipe_context *ctx, const struct pipe_draw_info
return;
}
- /* make sure that the gfx ring is only one active */
- rctx->rings.dma.flush(rctx, RADEON_FLUSH_ASYNC);
-
if (info.indexed) {
/* Initialize the index buffer struct. */
pipe_resource_reference(&ib.buffer, rctx->index_buffer.buffer);
diff --git a/src/gallium/drivers/r600/r600_texture.c b/src/gallium/drivers/r600/r600_texture.c
index b0c55145f1f..96c3729bc08 100644
--- a/src/gallium/drivers/r600/r600_texture.c
+++ b/src/gallium/drivers/r600/r600_texture.c
@@ -35,13 +35,19 @@
/* Copy from a full GPU texture to a transfer's staging one. */
static void r600_copy_to_staging_texture(struct pipe_context *ctx, struct r600_transfer *rtransfer)
{
+ struct r600_context *rctx = (struct r600_context*)ctx;
struct pipe_transfer *transfer = (struct pipe_transfer*)rtransfer;
struct pipe_resource *dst = &rtransfer->staging->b.b;
struct pipe_resource *src = transfer->resource;
if (src->nr_samples <= 1) {
- ctx->resource_copy_region(ctx, dst, 0, 0, 0, 0,
- src, transfer->level, &transfer->box);
+ if (!rctx->screen->dma_blit(ctx, dst, 0, 0, 0, 0,
+ src, transfer->level,
+ &transfer->box)) {
+ /* async dma could not be use */
+ ctx->resource_copy_region(ctx, dst, 0, 0, 0, 0,
+ src, transfer->level, &transfer->box);
+ }
} else {
/* Resolve the resource. */
struct pipe_blit_info blit;
@@ -66,16 +72,22 @@ static void r600_copy_to_staging_texture(struct pipe_context *ctx, struct r600_t
/* Copy from a transfer's staging texture to a full GPU one. */
static void r600_copy_from_staging_texture(struct pipe_context *ctx, struct r600_transfer *rtransfer)
{
+ struct r600_context *rctx = (struct r600_context*)ctx;
struct pipe_transfer *transfer = (struct pipe_transfer*)rtransfer;
struct pipe_resource *texture = transfer->resource;
struct pipe_box sbox;
u_box_3d(0, 0, 0, transfer->box.width, transfer->box.height, transfer->box.depth, &sbox);
- ctx->resource_copy_region(ctx, texture, transfer->level,
- transfer->box.x, transfer->box.y, transfer->box.z,
- &rtransfer->staging->b.b,
- 0, &sbox);
+ if (!rctx->screen->dma_blit(ctx, texture, transfer->level,
+ transfer->box.x, transfer->box.y, transfer->box.z,
+ &rtransfer->staging->b.b, 0, &sbox)) {
+ /* async dma could not be use */
+ ctx->resource_copy_region(ctx, texture, transfer->level,
+ transfer->box.x, transfer->box.y, transfer->box.z,
+ &rtransfer->staging->b.b,
+ 0, &sbox);
+ }
}
unsigned r600_texture_get_offset(struct r600_texture *rtex,
diff --git a/src/gallium/drivers/r600/r600d.h b/src/gallium/drivers/r600/r600d.h
index dd64aca3d51..621e7a10e78 100644
--- a/src/gallium/drivers/r600/r600d.h
+++ b/src/gallium/drivers/r600/r600d.h
@@ -3681,4 +3681,19 @@
#define SQ_TEX_INST_SAMPLE_C_G_LB 0x1E
#define SQ_TEX_INST_SAMPLE_C_G_LZ 0x1F
+/* async DMA packets */
+#define DMA_PACKET(cmd, t, s, n) ((((cmd) & 0xF) << 28) | \
+ (((t) & 0x1) << 23) | \
+ (((s) & 0x1) << 22) | \
+ (((n) & 0xFFFF) << 0))
+/* async DMA Packet types */
+#define DMA_PACKET_WRITE 0x2
+#define DMA_PACKET_COPY 0x3
+#define DMA_PACKET_INDIRECT_BUFFER 0x4
+#define DMA_PACKET_SEMAPHORE 0x5
+#define DMA_PACKET_FENCE 0x6
+#define DMA_PACKET_TRAP 0x7
+#define DMA_PACKET_CONSTANT_FILL 0xd /* 7xx only */
+#define DMA_PACKET_NOP 0xf
+
#endif