/*
* Copyright 2015 Advanced Micro Devices, Inc.
* All Rights Reserved.
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
* to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
* and/or sell copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice (including the next
* paragraph) shall be included in all copies or substantial portions of the
* Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
* SOFTWARE.
*/
/* The GPU load is measured as follows.
*
* There is a thread which samples the GRBM_STATUS register at a certain
* frequency and the "busy" or "idle" counter is incremented based on
* whether the GUI_ACTIVE bit is set or not.
*
* Then, the user can sample the counters twice and calculate the average
* GPU load between the two samples.
*/
#include "radeonsi/si_pipe.h"
#include "radeonsi/si_query.h"
#include "util/os_time.h"
/* For good accuracy at 1000 fps or lower. This will be inaccurate for higher
* fps (there are too few samples per frame). */
#define SAMPLES_PER_SEC 10000

/* GRBM_STATUS register (as passed to read_registers) and the single-bit
 * extractors applied to its value by si_update_mmio_counters. Each macro
 * yields 0 or 1 from the bit position shown. */
#define GRBM_STATUS 0x8010
#define TA_BUSY(x) (((x) >> 14) & 0x1)
#define GDS_BUSY(x) (((x) >> 15) & 0x1)
#define VGT_BUSY(x) (((x) >> 17) & 0x1)
#define IA_BUSY(x) (((x) >> 19) & 0x1)
#define SX_BUSY(x) (((x) >> 20) & 0x1)
#define WD_BUSY(x) (((x) >> 21) & 0x1)
#define SPI_BUSY(x) (((x) >> 22) & 0x1)
#define BCI_BUSY(x) (((x) >> 23) & 0x1)
#define SC_BUSY(x) (((x) >> 24) & 0x1)
#define PA_BUSY(x) (((x) >> 25) & 0x1)
#define DB_BUSY(x) (((x) >> 26) & 0x1)
#define CP_BUSY(x) (((x) >> 29) & 0x1)
#define CB_BUSY(x) (((x) >> 30) & 0x1)
#define GUI_ACTIVE(x) (((x) >> 31) & 0x1)

/* SRBM_STATUS2 register; si_update_mmio_counters reads it only when
 * chip_class is CIK or VI, and extracts the SDMA busy bit from it. */
#define SRBM_STATUS2 0x0e4c
#define SDMA_BUSY(x) (((x) >> 5) & 0x1)

/* CP_STAT register; si_update_mmio_counters reads it only when
 * chip_class >= VI. */
#define CP_STAT 0x8680
#define PFP_BUSY(x) (((x) >> 15) & 0x1)
#define MEQ_BUSY(x) (((x) >> 16) & 0x1)
#define ME_BUSY(x) (((x) >> 17) & 0x1)
#define SURFACE_SYNC_BUSY(x) (((x) >> 21) & 0x1)
#define DMA_BUSY(x) (((x) >> 22) & 0x1)
#define SCRATCH_RAM_BUSY(x) (((x) >> 24) & 0x1)

/* Pass-through "mask" for UPDATE_COUNTER, used when the sampled value is
 * already a 0/1 flag rather than a raw register (the combined "gpu"
 * counter). */
#define IDENTITY(x) x
/* Bump either the "busy" or the "idle" tally of one named counter pair,
 * depending on whether "mask" applied to the most recently read register
 * value is non-zero. Expects "reg" and "counters" to be in scope at the
 * expansion site. */
#define UPDATE_COUNTER(field, mask) \
    do { \
        if (mask(reg)) \
            p_atomic_inc(&counters->named.field.busy); \
        else \
            p_atomic_inc(&counters->named.field.idle); \
    } while (0)

/* Take one sample of the hardware status registers and record it in
 * "counters".
 *
 * GRBM_STATUS is always read. SRBM_STATUS2 is read only when chip_class is
 * CIK or VI, and CP_STAT only when chip_class >= VI. For every bit that is
 * sampled, the matching busy/idle pair is incremented once. Finally the
 * aggregate "gpu" pair is updated: busy if either GUI_ACTIVE or (when it
 * was sampled) SDMA_BUSY was set.
 */
static void si_update_mmio_counters(struct si_screen *sscreen,
                                    union si_mmio_counters *counters)
{
    uint32_t reg = 0;
    bool gui_active;
    bool sdma_active = false;

    /* GRBM_STATUS */
    sscreen->ws->read_registers(sscreen->ws, GRBM_STATUS, 1, &reg);
    gui_active = GUI_ACTIVE(reg);

    UPDATE_COUNTER(ta, TA_BUSY);
    UPDATE_COUNTER(gds, GDS_BUSY);
    UPDATE_COUNTER(vgt, VGT_BUSY);
    UPDATE_COUNTER(ia, IA_BUSY);
    UPDATE_COUNTER(sx, SX_BUSY);
    UPDATE_COUNTER(wd, WD_BUSY);
    UPDATE_COUNTER(spi, SPI_BUSY);
    UPDATE_COUNTER(bci, BCI_BUSY);
    UPDATE_COUNTER(sc, SC_BUSY);
    UPDATE_COUNTER(pa, PA_BUSY);
    UPDATE_COUNTER(db, DB_BUSY);
    UPDATE_COUNTER(cp, CP_BUSY);
    UPDATE_COUNTER(cb, CB_BUSY);
    UPDATE_COUNTER(gui, GUI_ACTIVE);

    /* SRBM_STATUS2 */
    if (sscreen->info.chip_class == CIK || sscreen->info.chip_class == VI) {
        sscreen->ws->read_registers(sscreen->ws, SRBM_STATUS2, 1, &reg);
        sdma_active = SDMA_BUSY(reg);

        UPDATE_COUNTER(sdma, SDMA_BUSY);
    }

    /* CP_STAT */
    if (sscreen->info.chip_class >= VI) {
        sscreen->ws->read_registers(sscreen->ws, CP_STAT, 1, &reg);

        UPDATE_COUNTER(pfp, PFP_BUSY);
        UPDATE_COUNTER(meq, MEQ_BUSY);
        UPDATE_COUNTER(me, ME_BUSY);
        UPDATE_COUNTER(surf_sync, SURFACE_SYNC_BUSY);
        UPDATE_COUNTER(cp_dma, DMA_BUSY);
        UPDATE_COUNTER(scratch_ram, SCRATCH_RAM_BUSY);
    }

    /* Aggregate load: reuse "reg" as a plain 0/1 flag so the macro can be
     * applied with the pass-through mask. */
    reg = gui_active || sdma_active;
    UPDATE_COUNTER(gpu, IDENTITY);
}

#undef UPDATE_COUNTER
/* Entry point of the sampling thread.
 *
 * "param" is the struct si_screen to sample. The loop runs until
 * gpu_load_stop_thread becomes non-zero; each iteration sleeps, re-tunes
 * the sleep length by 1 us towards the period derived from SAMPLES_PER_SEC,
 * and then updates sscreen->mmio_counters. Always returns 0.
 */
static int
si_gpu_load_thread(void *param)
{
    struct si_screen *sscreen = (struct si_screen*)param;
    const int target_period_us = 1000000 / SAMPLES_PER_SEC;
    int sleep_us = target_period_us;
    int64_t cur_time;
    int64_t last_time = os_time_get();

    while (!p_atomic_read(&sscreen->gpu_load_stop_thread)) {
        if (sleep_us != 0)
            os_time_sleep(sleep_us);

        /* Make sure we sleep the ideal amount of time to match
         * the expected frequency: when os_time_timeout() reports true for
         * the [last_time, last_time + period] window, sleep 1 us less
         * (never below 1 us); otherwise sleep 1 us more. */
        cur_time = os_time_get();
        sleep_us = os_time_timeout(last_time, last_time + target_period_us,
                                   cur_time)
                       ? MAX2(sleep_us - 1, 1)
                       : sleep_us + 1;

        /*printf("Hz: %.1f\n", 1000000.0 / (cur_time - last_time));*/
        last_time = cur_time;

        /* Update the counters. */
        si_update_mmio_counters(sscreen, &sscreen->mmio_counters);
    }

    /* Decrement the stop flag again; si_gpu_load_kill_thread incremented
     * it to request this exit. */
    p_atomic_dec(&sscreen->gpu_load_stop_thread);
    return 0;
}
/* Stop the sampling thread and wait for it to finish.
 * Does nothing when no thread handle is stored in sscreen. */
void si_gpu_load_kill_thread(struct si_screen *sscreen)
{
    if (sscreen->gpu_load_thread) {
        /* Raise the stop flag polled by the thread's loop, then join it
         * and clear the stored handle. */
        p_atomic_inc(&sscreen->gpu_load_stop_thread);
        thrd_join(sscreen->gpu_load_thread, NULL);
        sscreen->gpu_load_thread = 0;
    }
}
/* Snapshot one busy/idle counter pair.
 *
 * The sampling thread is created on first use. "busy_index" selects the
 * busy counter in mmio_counters.array; the idle counter is the element
 * right after it.
 *
 * Returns the busy count in the low 32 bits and the idle count in the
 * high 32 bits.
 */
static uint64_t si_read_mmio_counter(struct si_screen *sscreen,
                                     unsigned busy_index)
{
    /* Start the thread if needed. */
    if (!sscreen->gpu_load_thread) {
        mtx_lock(&sscreen->gpu_load_mutex);
        /* Check again inside the mutex. */
        if (!sscreen->gpu_load_thread) {
            sscreen->gpu_load_thread =
                u_thread_create(si_gpu_load_thread, sscreen);
        }
        mtx_unlock(&sscreen->gpu_load_mutex);
    }

    unsigned busy_count =
        p_atomic_read(&sscreen->mmio_counters.array[busy_index]);
    unsigned idle_count =
        p_atomic_read(&sscreen->mmio_counters.array[busy_index + 1]);

    uint64_t packed = (uint64_t)idle_count << 32;
    packed |= busy_count;
    return packed;
}
/* Compute the load percentage of one counter pair since "begin".
 *
 * "begin" is a packed snapshot as returned by si_read_mmio_counter (busy
 * count in the low 32 bits, idle count in the high 32 bits); "busy_index"
 * must be the same index that produced it.
 *
 * Returns a value in the range 0..100.
 */
static unsigned si_end_mmio_counter(struct si_screen *sscreen,
                                    uint64_t begin, unsigned busy_index)
{
    uint64_t end = si_read_mmio_counter(sscreen, busy_index);

    /* 32-bit unsigned subtraction gives the right delta even if a counter
     * wrapped between the two snapshots. */
    unsigned busy = (end & 0xffffffff) - (begin & 0xffffffff);
    unsigned idle = (end >> 32) - (begin >> 32);

    /* Calculate the % of time the busy counter was being incremented.
     *
     * If no counters were incremented, return the current counter status.
     * It's for the case when the load is queried faster than
     * the counters are updated.
     */
    if (idle || busy) {
        /* Do the arithmetic in 64 bits: with 32-bit math "busy * 100"
         * wraps once more than ~42.9 million busy samples were collected,
         * and "busy + idle" could wrap as well (to 0 in the worst case,
         * causing a division by zero). */
        uint64_t total = (uint64_t)busy + idle;

        return (unsigned)((uint64_t)busy * 100 / total);
    } else {
        union si_mmio_counters counters;

        /* Sample once into a zeroed scratch counter set; the busy slot is
         * non-zero only if the unit was busy in that sample. */
        memset(&counters, 0, sizeof(counters));
        si_update_mmio_counters(sscreen, &counters);
        return counters.array[busy_index] ? 100 : 0;
    }
}
/* Index, within mmio_counters.array, of the "busy" member of the named
 * counter pair "field". */
#define BUSY_INDEX(sscreen, field) (&sscreen->mmio_counters.named.field.busy - \
                                    sscreen->mmio_counters.array)

/* Translate an SI_QUERY_GPU_* query type into the array index of the
 * corresponding busy counter. Any other type is treated as unreachable. */
static unsigned busy_index_from_type(struct si_screen *sscreen,
                                     unsigned type)
{
    unsigned index;

    switch (type) {
    case SI_QUERY_GPU_LOAD:
        index = BUSY_INDEX(sscreen, gpu);
        break;
    case SI_QUERY_GPU_SHADERS_BUSY:
        index = BUSY_INDEX(sscreen, spi);
        break;
    case SI_QUERY_GPU_TA_BUSY:
        index = BUSY_INDEX(sscreen, ta);
        break;
    case SI_QUERY_GPU_GDS_BUSY:
        index = BUSY_INDEX(sscreen, gds);
        break;
    case SI_QUERY_GPU_VGT_BUSY:
        index = BUSY_INDEX(sscreen, vgt);
        break;
    case SI_QUERY_GPU_IA_BUSY:
        index = BUSY_INDEX(sscreen, ia);
        break;
    case SI_QUERY_GPU_SX_BUSY:
        index = BUSY_INDEX(sscreen, sx);
        break;
    case SI_QUERY_GPU_WD_BUSY:
        index = BUSY_INDEX(sscreen, wd);
        break;
    case SI_QUERY_GPU_BCI_BUSY:
        index = BUSY_INDEX(sscreen, bci);
        break;
    case SI_QUERY_GPU_SC_BUSY:
        index = BUSY_INDEX(sscreen, sc);
        break;
    case SI_QUERY_GPU_PA_BUSY:
        index = BUSY_INDEX(sscreen, pa);
        break;
    case SI_QUERY_GPU_DB_BUSY:
        index = BUSY_INDEX(sscreen, db);
        break;
    case SI_QUERY_GPU_CP_BUSY:
        index = BUSY_INDEX(sscreen, cp);
        break;
    case SI_QUERY_GPU_CB_BUSY:
        index = BUSY_INDEX(sscreen, cb);
        break;
    case SI_QUERY_GPU_SDMA_BUSY:
        index = BUSY_INDEX(sscreen, sdma);
        break;
    case SI_QUERY_GPU_PFP_BUSY:
        index = BUSY_INDEX(sscreen, pfp);
        break;
    case SI_QUERY_GPU_MEQ_BUSY:
        index = BUSY_INDEX(sscreen, meq);
        break;
    case SI_QUERY_GPU_ME_BUSY:
        index = BUSY_INDEX(sscreen, me);
        break;
    case SI_QUERY_GPU_SURF_SYNC_BUSY:
        index = BUSY_INDEX(sscreen, surf_sync);
        break;
    case SI_QUERY_GPU_CP_DMA_BUSY:
        index = BUSY_INDEX(sscreen, cp_dma);
        break;
    case SI_QUERY_GPU_SCRATCH_RAM_BUSY:
        index = BUSY_INDEX(sscreen, scratch_ram);
        break;
    default:
        unreachable("invalid query type");
    }

    return index;
}
/* Begin a load measurement for the given SI_QUERY_GPU_* type.
 * Returns the packed busy/idle snapshot to hand to si_end_counter later. */
uint64_t si_begin_counter(struct si_screen *sscreen, unsigned type)
{
    return si_read_mmio_counter(sscreen,
                                busy_index_from_type(sscreen, type));
}
/* Finish a load measurement for the given SI_QUERY_GPU_* type.
 * "begin" is the snapshot returned by si_begin_counter for the same type;
 * the result is the value computed by si_end_mmio_counter. */
unsigned si_end_counter(struct si_screen *sscreen, unsigned type,
                        uint64_t begin)
{
    return si_end_mmio_counter(sscreen, begin,
                               busy_index_from_type(sscreen, type));
}
|