4 files changed, 118 insertions, 49 deletions
diff --git a/src/gallium/drivers/r600/r600.h b/src/gallium/drivers/r600/r600.h
index 2af4d311f60..61adc7ed988 100644
--- a/src/gallium/drivers/r600/r600.h
+++ b/src/gallium/drivers/r600/r600.h
@@ -211,14 +211,21 @@ struct r600_reloc {
  */
 struct r600_query {
 	u64					result;
-	/* The kind of query. Currently only OQ is supported. */
+	/* The kind of query */
 	unsigned				type;
-	/* How many results have been written, in dwords. It's incremented
-	 * after end_query and flush. */
-	unsigned				num_results;
-	/* if we've flushed the query */
+	/* Offset of the first result for current query */
+	unsigned				results_start;
+	/* Offset of the next free result after current query data */
+	unsigned				results_end;
+	/* Size of the result */
+	unsigned				result_size;
+	/* Count of new queries started in one stream without flushing */
+	unsigned				queries_emitted;
+	/* State flags */
 	unsigned				state;
-	/* The buffer where query results are stored. */
+	/* The buffer where query results are stored. It's used as a ring,
+	 * data blocks for current query are stored sequentially from
+	 * results_start to results_end, with wrapping on the buffer end */
 	struct r600_bo			*buffer;
 	unsigned				buffer_size;
 	/* linked list of queries */
diff --git a/src/gallium/drivers/r600/r600_query.c b/src/gallium/drivers/r600/r600_query.c
index 174505c75e9..de1f5d05f4e 100644
--- a/src/gallium/drivers/r600/r600_query.c
+++ b/src/gallium/drivers/r600/r600_query.c
@@ -43,7 +43,7 @@ static void r600_begin_query(struct pipe_context *ctx, struct pipe_query *query)
 	struct r600_query *rquery = (struct r600_query *)query;
 
 	rquery->result = 0;
-	rquery->num_results = 0;
+	rquery->results_start = rquery->results_end;
 	r600_query_begin(&rctx->ctx, (struct r600_query *)query);
 }
 
@@ -72,12 +72,18 @@ static void r600_render_condition(struct pipe_context *ctx,
 	struct r600_query *rquery = (struct r600_query *)query;
 	int wait_flag = 0;
 
+	/* If we already have a nonzero result, render unconditionally */
+	if (query != NULL && rquery->result != 0)
+		return;
+
 	rctx->current_render_cond = query;
 	rctx->current_render_cond_mode = mode;
 
-	if (!query) {
-		rctx->ctx.predicate_drawing = false;
-		r600_query_predication(&rctx->ctx, NULL, PREDICATION_OP_CLEAR, 1);
+	if (query == NULL) {
+		if (rctx->ctx.predicate_drawing) {
+			rctx->ctx.predicate_drawing = false;
+			r600_query_predication(&rctx->ctx, NULL, PREDICATION_OP_CLEAR, 1);
+		}
 		return;
 	}
 
@@ -88,7 +94,6 @@ static void r600_render_condition(struct pipe_context *ctx,
 
 	rctx->ctx.predicate_drawing = true;
 	r600_query_predication(&rctx->ctx, rquery, PREDICATION_OP_ZPASS, wait_flag);
-	
 }
 
 void r600_init_query_functions(struct r600_pipe_context *rctx)
diff --git a/src/gallium/winsys/r600/drm/r600_hw_context.c b/src/gallium/winsys/r600/drm/r600_hw_context.c
index a2f13ff0863..922367d85f1 100644
--- a/src/gallium/winsys/r600/drm/r600_hw_context.c
+++ b/src/gallium/winsys/r600/drm/r600_hw_context.c
@@ -1695,10 +1695,9 @@ out_err:
 
 static boolean r600_query_result(struct r600_context *ctx, struct r600_query *query, boolean wait)
 {
+	unsigned results_base = query->results_start;
 	u64 start, end;
-	u32 *results;
-	int i;
-	int size;
+	u32 *results, *current_result;
 
 	if (wait)
 		results = r600_bo_map(ctx->radeon, query->buffer, PB_USAGE_CPU_READ, NULL);
@@ -1707,25 +1706,31 @@ static boolean r600_query_result(struct r600_context *ctx, struct r600_query *qu
 	if (!results)
 		return FALSE;
 
-	/* query->num_results contains how many dwords were used for the query */
-	size = query->num_results;
-	for (i = 0; i < size; i += 4) {
-		start = (u64)results[i] | (u64)results[i + 1] << 32;
-		end = (u64)results[i + 2] | (u64)results[i + 3] << 32;
+
+	/* count all results across all data blocks */
+	while (results_base != query->results_end) {
+		current_result = (u32*)((char*)results + results_base);
+
+		start = (u64)current_result[0] | (u64)current_result[1] << 32;
+		end = (u64)current_result[2] | (u64)current_result[3] << 32;
 		if (((start & 0x8000000000000000UL) && (end & 0x8000000000000000UL))
                     || query->type == PIPE_QUERY_TIME_ELAPSED) {
 			query->result += end - start;
 		}
+
+		results_base += 4 * 4;
+		if (results_base >= query->buffer_size)
+			results_base = 0;
 	}
-	r600_bo_unmap(ctx->radeon, query->buffer);
-	query->num_results = 0;
 
+	query->results_start = query->results_end;
+	r600_bo_unmap(ctx->radeon, query->buffer);
 	return TRUE;
 }
 
 void r600_query_begin(struct r600_context *ctx, struct r600_query *query)
 {
-	unsigned required_space, required_buffer;
+	unsigned required_space, new_results_end;
 	int num_backends = r600_get_num_backends(ctx->radeon);
 
 	/* query request needs 6/8 dwords for begin + 6/8 dwords for end */
@@ -1739,26 +1744,41 @@ void r600_query_begin(struct r600_context *ctx, struct r600_query *query)
 		r600_context_flush(ctx);
 	}
 
-	required_buffer = query->num_results +
-		4 * (query->type == PIPE_QUERY_OCCLUSION_COUNTER ? ctx->max_db : 1);
+	/* if it's a new OQ (not a resume) */
+	if (query->type == PIPE_QUERY_OCCLUSION_COUNTER &&
+		query->results_start == query->results_end) {
+		/* Count queries emitted without flushes, and flush if more than
+		 * half of buffer used, to avoid overwriting results which may be
+		 * still in use. */
+		if (query->state & R600_QUERY_STATE_FLUSHED) {
+			query->queries_emitted = 1;
+		} else {
+			if (++query->queries_emitted > query->buffer_size / query->result_size / 2)
+				r600_context_flush(ctx);
+		}
+	}
+
+	new_results_end = query->results_end + query->result_size;
+	if (new_results_end >= query->buffer_size)
+		new_results_end = 0;	/* wrap at == buffer_size too, as r600_query_end does for results_end */
 
-	/* if query buffer is full force a flush */
-	if (required_buffer*4 > query->buffer_size) {
+	/* collect current results if query buffer is full */
+	if (new_results_end == query->results_start) {
 		if (!(query->state & R600_QUERY_STATE_FLUSHED))
 			r600_context_flush(ctx);
 		r600_query_result(ctx, query, TRUE);
 	}
 
-	if (query->type == PIPE_QUERY_OCCLUSION_COUNTER &&
-	    num_backends > 0) {
-		/* as per info on ZPASS the driver must set the unusued DB top bits */
+	if (query->type == PIPE_QUERY_OCCLUSION_COUNTER) {
 		u32 *results;
 		int i;
 
 		results = r600_bo_map(ctx->radeon, query->buffer, PB_USAGE_CPU_WRITE, NULL);
 		if (results) {
-			memset(results + query->num_results, 0, ctx->max_db * 4 * 4);
+			results = (u32*)((char*)results + query->results_end);
+			memset(results, 0, query->result_size);
 
+			/* Set top bits for unused backends */
 			for (i = num_backends; i < ctx->max_db; i++) {
 				results[(i * 4)+1] = 0x80000000;
 				results[(i * 4)+3] = 0x80000000;
@@ -1771,14 +1791,14 @@ void r600_query_begin(struct r600_context *ctx, struct r600_query *query)
 	if (query->type == PIPE_QUERY_TIME_ELAPSED) {
 		ctx->pm4[ctx->pm4_cdwords++] = PKT3(PKT3_EVENT_WRITE_EOP, 4, 0);
 		ctx->pm4[ctx->pm4_cdwords++] = EVENT_TYPE(EVENT_TYPE_CACHE_FLUSH_AND_INV_TS_EVENT) | EVENT_INDEX(5);
-		ctx->pm4[ctx->pm4_cdwords++] = query->num_results*4 + r600_bo_offset(query->buffer);
+		ctx->pm4[ctx->pm4_cdwords++] = query->results_end + r600_bo_offset(query->buffer);
 		ctx->pm4[ctx->pm4_cdwords++] = (3 << 29);
 		ctx->pm4[ctx->pm4_cdwords++] = 0;
 		ctx->pm4[ctx->pm4_cdwords++] = 0;
 	} else {
 		ctx->pm4[ctx->pm4_cdwords++] = PKT3(PKT3_EVENT_WRITE, 2, 0);
 		ctx->pm4[ctx->pm4_cdwords++] = EVENT_TYPE(EVENT_TYPE_ZPASS_DONE) | EVENT_INDEX(1);
-		ctx->pm4[ctx->pm4_cdwords++] = query->num_results*4 + r600_bo_offset(query->buffer);
+		ctx->pm4[ctx->pm4_cdwords++] = query->results_end + r600_bo_offset(query->buffer);
 		ctx->pm4[ctx->pm4_cdwords++] = 0;
 	}
 	ctx->pm4[ctx->pm4_cdwords++] = PKT3(PKT3_NOP, 0, 0);
@@ -1792,50 +1812,75 @@ void r600_query_begin(struct r600_context *ctx, struct r600_query *query)
 
 void r600_query_end(struct r600_context *ctx, struct r600_query *query)
 {
-	/* emit begin query */
+	/* emit end query */
 	if (query->type == PIPE_QUERY_TIME_ELAPSED) {
 		ctx->pm4[ctx->pm4_cdwords++] = PKT3(PKT3_EVENT_WRITE_EOP, 4, 0);
 		ctx->pm4[ctx->pm4_cdwords++] = EVENT_TYPE(EVENT_TYPE_CACHE_FLUSH_AND_INV_TS_EVENT) | EVENT_INDEX(5);
-		ctx->pm4[ctx->pm4_cdwords++] = query->num_results*4 + 8 + r600_bo_offset(query->buffer);
+		ctx->pm4[ctx->pm4_cdwords++] = query->results_end + 8 + r600_bo_offset(query->buffer);
 		ctx->pm4[ctx->pm4_cdwords++] = (3 << 29);
 		ctx->pm4[ctx->pm4_cdwords++] = 0;
 		ctx->pm4[ctx->pm4_cdwords++] = 0;
 	} else {
 		ctx->pm4[ctx->pm4_cdwords++] = PKT3(PKT3_EVENT_WRITE, 2, 0);
 		ctx->pm4[ctx->pm4_cdwords++] = EVENT_TYPE(EVENT_TYPE_ZPASS_DONE) | EVENT_INDEX(1);
-		ctx->pm4[ctx->pm4_cdwords++] = query->num_results*4 + 8 + r600_bo_offset(query->buffer);
+		ctx->pm4[ctx->pm4_cdwords++] = query->results_end + 8 + r600_bo_offset(query->buffer);
 		ctx->pm4[ctx->pm4_cdwords++] = 0;
 	}
 	ctx->pm4[ctx->pm4_cdwords++] = PKT3(PKT3_NOP, 0, 0);
 	ctx->pm4[ctx->pm4_cdwords++] = 0;
 	r600_context_bo_reloc(ctx, &ctx->pm4[ctx->pm4_cdwords - 1], query->buffer);
 
-	query->num_results += 4 * (query->type == PIPE_QUERY_OCCLUSION_COUNTER ? ctx->max_db : 1);
+	query->results_end += query->result_size;
+	if (query->results_end >= query->buffer_size)
+		query->results_end = 0;
+
 	query->state ^= R600_QUERY_STATE_STARTED;
 	query->state |= R600_QUERY_STATE_ENDED;
 	query->state &= ~R600_QUERY_STATE_FLUSHED;
+
 	ctx->num_query_running--;
 }
 
 void r600_query_predication(struct r600_context *ctx, struct r600_query *query, int operation,
 			    int flag_wait)
 {
-	ctx->pm4[ctx->pm4_cdwords++] = PKT3(PKT3_SET_PREDICATION, 1, 0);
-
 	if (operation == PREDICATION_OP_CLEAR) {
+		if (ctx->pm4_cdwords + 3 > ctx->pm4_ndwords)
+			r600_context_flush(ctx);
+
+		ctx->pm4[ctx->pm4_cdwords++] = PKT3(PKT3_SET_PREDICATION, 1, 0);
 		ctx->pm4[ctx->pm4_cdwords++] = 0;
 		ctx->pm4[ctx->pm4_cdwords++] = PRED_OP(PREDICATION_OP_CLEAR);
 	} else {
-		int results_base = query->num_results - (4 * ctx->max_db);
+		unsigned results_base = query->results_start;
+		unsigned count;
+		u32 op;
 
-		if (results_base < 0)
-			results_base = 0;
+		/* find count of the query data blocks */
+		count = query->buffer_size + query->results_end - query->results_start;
+		if (count > query->buffer_size) count-=query->buffer_size;
+		count /= query->result_size;
 
-		ctx->pm4[ctx->pm4_cdwords++] = results_base*4 + r600_bo_offset(query->buffer);
-		ctx->pm4[ctx->pm4_cdwords++] = PRED_OP(operation) | (flag_wait ? PREDICATION_HINT_WAIT : PREDICATION_HINT_NOWAIT_DRAW) | PREDICATION_DRAW_VISIBLE;
-		ctx->pm4[ctx->pm4_cdwords++] = PKT3(PKT3_NOP, 0, 0);
-		ctx->pm4[ctx->pm4_cdwords++] = 0;
-		r600_context_bo_reloc(ctx, &ctx->pm4[ctx->pm4_cdwords - 1], query->buffer);
+		if (ctx->pm4_cdwords + 5 * count > ctx->pm4_ndwords)
+			r600_context_flush(ctx);
+
+		op = PRED_OP(operation) | PREDICATION_DRAW_VISIBLE |
+				(flag_wait ? PREDICATION_HINT_WAIT : PREDICATION_HINT_NOWAIT_DRAW);
+
+		/* emit predicate packets for all data blocks */
+		while (results_base != query->results_end) {
+			ctx->pm4[ctx->pm4_cdwords++] = PKT3(PKT3_SET_PREDICATION, 1, 0);
+			ctx->pm4[ctx->pm4_cdwords++] = results_base + r600_bo_offset(query->buffer);
+			ctx->pm4[ctx->pm4_cdwords++] = op;
+			ctx->pm4[ctx->pm4_cdwords++] = PKT3(PKT3_NOP, 0, 0);
+			ctx->pm4[ctx->pm4_cdwords++] = 0;
+			r600_context_bo_reloc(ctx, &ctx->pm4[ctx->pm4_cdwords - 1], query->buffer);
+			results_base += query->result_size;
+			if (results_base >= query->buffer_size)
+				results_base = 0;
+			/* set CONTINUE bit for all packets except the first */
+			op |= PREDICATION_CONTINUE;
+		}
 	}
 }
 
@@ -1853,6 +1898,14 @@ struct r600_query *r600_context_query_create(struct r600_context *ctx, unsigned
 	query->type = query_type;
 	query->buffer_size = 4096;
 
+	if (query_type == PIPE_QUERY_OCCLUSION_COUNTER)
+		query->result_size = 4 * 4 * ctx->max_db;
+	else
+		query->result_size = 4 * 4;
+
+	/* round buffer size down to a multiple of result_size to simplify offset wrapping math */
+	query->buffer_size -= query->buffer_size % query->result_size;
+
 	/* As of GL4, query buffers are normally read by the CPU after
 	 * being written by the gpu, hence staging is probably a good
 	 * usage pattern.
@@ -1882,7 +1935,7 @@ boolean r600_context_query_result(struct r600_context *ctx,
 {
 	uint64_t *result = (uint64_t*)vresult;
 
-	if (query->num_results && !(query->state & R600_QUERY_STATE_FLUSHED)) {
+	if (!(query->state & R600_QUERY_STATE_FLUSHED)) {
 		r600_context_flush(ctx);
 	}
 	if (!r600_query_result(ctx, query, wait))
@@ -1912,10 +1965,12 @@ void r600_context_queries_resume(struct r600_context *ctx, boolean flushed)
 	struct r600_query *query;
 
 	LIST_FOR_EACH_ENTRY(query, &ctx->query_list, list) {
+		if (flushed)
+			query->state |= R600_QUERY_STATE_FLUSHED;
+
 		if (query->state & R600_QUERY_STATE_SUSPENDED) {
 			r600_query_begin(ctx, query);
 			query->state ^= R600_QUERY_STATE_SUSPENDED;
-		} else if (flushed && query->state==R600_QUERY_STATE_ENDED)
-			query->state |= R600_QUERY_STATE_FLUSHED;
+		}
 	}
 }
diff --git a/src/gallium/winsys/r600/drm/r600d.h b/src/gallium/winsys/r600/drm/r600d.h
index 80424818044..4a19dcf8ddf 100644
--- a/src/gallium/winsys/r600/drm/r600d.h
+++ b/src/gallium/winsys/r600/drm/r600d.h
@@ -114,6 +114,8 @@
 
 #define PRED_OP(x) ((x) << 16)
 
+#define PREDICATION_CONTINUE (1u << 31) /* unsigned: 1 << 31 overflows signed int */
+
 #define PREDICATION_HINT_WAIT (0 << 12)
 #define PREDICATION_HINT_NOWAIT_DRAW (1 << 12)