aboutsummaryrefslogtreecommitdiffstats
path: root/src/gallium/drivers/freedreno/freedreno_util.h
diff options
context:
space:
mode:
authorRob Clark <[email protected]>2018-10-21 10:22:11 -0400
committerRob Clark <[email protected]>2018-10-26 18:10:00 -0400
commitf3cc0d2747568a186dba433ac94af607c38fa024 (patch)
treea6db86fd1cfbb044180d3aba07e047714e6b9f1d /src/gallium/drivers/freedreno/freedreno_util.h
parentaa02d7e8781c25ee18b6da97606300808c84973a (diff)
freedreno: import libdrm_freedreno + redesign submit
In the pursuit of lowering driver overhead, it became clear that some amount of redesign of how libdrm_freedreno constructs the submit ioctl would be needed. In particular, as the gallium driver is starting to make heavier use of CP_SET_DRAW_STATE state groups/objects, the over- head of tracking cmd buffers and relocs becomes too much. And for "streaming" state, which isn't ever reused (like uniform uploads) the overhead of allocating/freeing ringbuffer[1] objects is too high. This redesign makes two main changes: 1) Introduces a fd_submit object for tracking bos and cmds table for the submit ioctl, making ringbuffer objects more light- weight. This was previously done in the ringbuffer. But we have many ringbuffer instances involved in a submit (gmem + draw + potentially 1000's of state-group rbs), and only need a single bos and cmds table. (Reloc table is still per-rb) The submit is also a convenient place for a slab allocator for ringbuffer objects. Other options would have required locking because, while we can guarantee allocations will only happen on a single thread, free's could happen either on the application thread or the flush_queue thread. With the slab allocator in the submit object, any frees that happen on the flush_queue thread happen after we know that the application thread is done with the submit. 2) Introduce a new "softpin" msm_ringbuffer_sp implementation that does not use relocs and only has cmds table entries for IB1 (ie. the cmdstream buffers that kernel needs to CP_INDIRECT_BUFFER to from the RB). To do this properly will require some updates on the kernel side, so whether you get the softpin or legacy submit/ringbuffer implementation at runtime depends on your kernel version. To make all these changes in libdrm would basically require adding a libdrm_freedreno2, so this is a good point to just pull the libdrm code into mesa. Plus it allows for using mesa's hashtable, slab allocator, etc. And it lets us have asserts enabled for debug mesa buids but omitted for release builds. And it makes life easier if further API changes become necessary. At this point I haven't tried to pull in the kgsl backend. Although I left the level of vfunc indirection which would make it possible to have other backends. (And this was convenient to keep to allow for the "softpin" ringbuffer to coexist.) NOTE: if bisecting a build error takes you here, try a clean build. There are a bunch of ways things can go wrong if you still have libdrm_freedreno cflags. [1] "ringbuffer" is probably a bad name, the only level of cmdstream buffer that is actually a ring is RB managed by kernel. User- space cmdstream is all IB1/IB2 and state-groups. Reviewed-by: Kristian H. Kristensen <[email protected]> Reviewed-by: Eric Engestrom <[email protected]> Signed-off-by: Rob Clark <[email protected]>
Diffstat (limited to 'src/gallium/drivers/freedreno/freedreno_util.h')
-rw-r--r--src/gallium/drivers/freedreno/freedreno_util.h30
1 files changed, 9 insertions, 21 deletions
diff --git a/src/gallium/drivers/freedreno/freedreno_util.h b/src/gallium/drivers/freedreno/freedreno_util.h
index 82bcb9b33f0..81622506f1e 100644
--- a/src/gallium/drivers/freedreno/freedreno_util.h
+++ b/src/gallium/drivers/freedreno/freedreno_util.h
@@ -27,8 +27,8 @@
#ifndef FREEDRENO_UTIL_H_
#define FREEDRENO_UTIL_H_
-#include <freedreno_drmif.h>
-#include <freedreno_ringbuffer.h>
+#include "drm/freedreno_drmif.h"
+#include "drm/freedreno_ringbuffer.h"
#include "pipe/p_format.h"
#include "pipe/p_state.h"
@@ -84,6 +84,7 @@ enum adreno_stencil_op fd_stencil_op(unsigned op);
#define FD_DBG_HIPRIO 0x100000
#define FD_DBG_TTILE 0x200000
#define FD_DBG_PERFC 0x400000
+#define FD_DBG_SOFTPIN 0x800000
extern int fd_mesa_debug;
extern bool fd_binning_enabled;
@@ -202,7 +203,7 @@ OUT_RING(struct fd_ringbuffer *ring, uint32_t data)
{
if (LOG_DWORDS) {
DBG("ring[%p]: OUT_RING %04x: %08x", ring,
- (uint32_t)(ring->cur - ring->last_start), data);
+ (uint32_t)(ring->cur - ring->start), data);
}
fd_ringbuffer_emit(ring, data);
}
@@ -214,7 +215,7 @@ OUT_RINGP(struct fd_ringbuffer *ring, uint32_t data,
{
if (LOG_DWORDS) {
DBG("ring[%p]: OUT_RINGP %04x: %08x", ring,
- (uint32_t)(ring->cur - ring->last_start), data);
+ (uint32_t)(ring->cur - ring->start), data);
}
util_dynarray_append(buf, struct fd_cs_patch, ((struct fd_cs_patch){
.cs = ring->cur++,
@@ -232,10 +233,10 @@ OUT_RELOC(struct fd_ringbuffer *ring, struct fd_bo *bo,
{
if (LOG_DWORDS) {
DBG("ring[%p]: OUT_RELOC %04x: %p+%u << %d", ring,
- (uint32_t)(ring->cur - ring->last_start), bo, offset, shift);
+ (uint32_t)(ring->cur - ring->start), bo, offset, shift);
}
debug_assert(offset < fd_bo_size(bo));
- fd_ringbuffer_reloc2(ring, &(struct fd_reloc){
+ fd_ringbuffer_reloc(ring, &(struct fd_reloc){
.bo = bo,
.flags = FD_RELOC_READ,
.offset = offset,
@@ -251,10 +252,10 @@ OUT_RELOCW(struct fd_ringbuffer *ring, struct fd_bo *bo,
{
if (LOG_DWORDS) {
DBG("ring[%p]: OUT_RELOCW %04x: %p+%u << %d", ring,
- (uint32_t)(ring->cur - ring->last_start), bo, offset, shift);
+ (uint32_t)(ring->cur - ring->start), bo, offset, shift);
}
debug_assert(offset < fd_bo_size(bo));
- fd_ringbuffer_reloc2(ring, &(struct fd_reloc){
+ fd_ringbuffer_reloc(ring, &(struct fd_reloc){
.bo = bo,
.flags = FD_RELOC_READ | FD_RELOC_WRITE,
.offset = offset,
@@ -276,18 +277,9 @@ static inline void BEGIN_RING(struct fd_ringbuffer *ring, uint32_t ndwords)
fd_ringbuffer_grow(ring, ndwords);
}
-static inline uint32_t
-__gpu_id(struct fd_ringbuffer *ring)
-{
- uint64_t val;
- fd_pipe_get_param(ring->pipe, FD_GPU_ID, &val);
- return val;
-}
-
static inline void
OUT_PKT0(struct fd_ringbuffer *ring, uint16_t regindx, uint16_t cnt)
{
- debug_assert(__gpu_id(ring) < 500);
BEGIN_RING(ring, cnt+1);
OUT_RING(ring, CP_TYPE0_PKT | ((cnt-1) << 16) | (regindx & 0x7FFF));
}
@@ -295,7 +287,6 @@ OUT_PKT0(struct fd_ringbuffer *ring, uint16_t regindx, uint16_t cnt)
static inline void
OUT_PKT2(struct fd_ringbuffer *ring)
{
- debug_assert(__gpu_id(ring) < 500);
BEGIN_RING(ring, 1);
OUT_RING(ring, CP_TYPE2_PKT);
}
@@ -303,7 +294,6 @@ OUT_PKT2(struct fd_ringbuffer *ring)
static inline void
OUT_PKT3(struct fd_ringbuffer *ring, uint8_t opcode, uint16_t cnt)
{
- debug_assert(__gpu_id(ring) < 500);
BEGIN_RING(ring, cnt+1);
OUT_RING(ring, CP_TYPE3_PKT | ((cnt-1) << 16) | ((opcode & 0xFF) << 8));
}
@@ -366,8 +356,6 @@ __OUT_IB(struct fd_ringbuffer *ring, bool prefetch, struct fd_ringbuffer *target
unsigned count = fd_ringbuffer_cmd_count(target);
- debug_assert(__gpu_id(ring) < 500);
-
/* for debug after a lock up, write a unique counter value
* to scratch6 for each IB, to make it easier to match up
* register dumps to cmdstream. The combination of IB and