aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
-rw-r--r--common.py1
-rwxr-xr-xscons/gallium.py12
-rw-r--r--src/mesa/swrast/s_aatritemp.h72
-rw-r--r--src/mesa/swrast/s_context.c26
-rw-r--r--src/mesa/swrast/s_texcombine.c4
-rw-r--r--src/mesa/tnl/t_pipeline.c12
6 files changed, 91 insertions, 36 deletions
diff --git a/common.py b/common.py
index 8657030ea3f..cfee1b5dc2e 100644
--- a/common.py
+++ b/common.py
@@ -88,6 +88,7 @@ def AddOptions(opts):
opts.Add('toolchain', 'compiler toolchain', default_toolchain)
opts.Add(BoolOption('gles', 'EXPERIMENTAL: enable OpenGL ES support', 'no'))
opts.Add(BoolOption('llvm', 'use LLVM', default_llvm))
+ opts.Add(BoolOption('openmp', 'EXPERIMENTAL: compile with openmp (swrast)', 'no'))
opts.Add(BoolOption('debug', 'DEPRECATED: debug build', 'yes'))
opts.Add(BoolOption('profile', 'DEPRECATED: profile build', 'no'))
opts.Add(BoolOption('quiet', 'DEPRECATED: profile build', 'yes'))
diff --git a/scons/gallium.py b/scons/gallium.py
index 8cd3bc7f6e0..7135251d7a3 100755
--- a/scons/gallium.py
+++ b/scons/gallium.py
@@ -596,6 +596,18 @@ def generate(env):
libs += ['m', 'pthread', 'dl']
env.Append(LIBS = libs)
+ # OpenMP
+ if env['openmp']:
+ if env['msvc']:
+ env.Append(CCFLAGS = ['/openmp'])
+ # When building openmp release VS2008 link.exe crashes with LNK1103 error.
+ # Workaround: overwrite PDB flags with empty value as it isn't required anyways
+ if env['build'] == 'release':
+ env['PDB'] = ''
+ if env['gcc']:
+ env.Append(CCFLAGS = ['-fopenmp'])
+ env.Append(LIBS = ['gomp'])
+
# Load tools
env.Tool('lex')
env.Tool('yacc')
diff --git a/src/mesa/swrast/s_aatritemp.h b/src/mesa/swrast/s_aatritemp.h
index 91d4f7a10ab..77b3ae6ec7a 100644
--- a/src/mesa/swrast/s_aatritemp.h
+++ b/src/mesa/swrast/s_aatritemp.h
@@ -181,13 +181,20 @@
const GLfloat *pMax = vMax->attrib[FRAG_ATTRIB_WPOS];
const GLfloat dxdy = majDx / majDy;
const GLfloat xAdj = dxdy < 0.0F ? -dxdy : 0.0F;
- GLfloat x = pMin[0] - (yMin - iyMin) * dxdy;
GLint iy;
- for (iy = iyMin; iy < iyMax; iy++, x += dxdy) {
+#ifdef _OPENMP
+#pragma omp parallel for schedule(dynamic) private(iy) firstprivate(span)
+#endif
+ for (iy = iyMin; iy < iyMax; iy++) {
+ GLfloat x = pMin[0] - (yMin - iy) * dxdy;
GLint ix, startX = (GLint) (x - xAdj);
GLuint count;
GLfloat coverage = 0.0F;
+#ifdef _OPENMP
+ /* each thread needs to use a different (global) SpanArrays variable */
+ span.array = SWRAST_CONTEXT(ctx)->SpanArrays + omp_get_thread_num();
+#endif
/* skip over fragments with zero coverage */
while (startX < MAX_WIDTH) {
coverage = compute_coveragef(pMin, pMid, pMax, startX, iy);
@@ -228,13 +235,12 @@
coverage = compute_coveragef(pMin, pMid, pMax, ix, iy);
}
- if (ix <= startX)
- continue;
-
- span.x = startX;
- span.y = iy;
- span.end = (GLuint) ix - (GLuint) startX;
- _swrast_write_rgba_span(ctx, &span);
+ if (ix > startX) {
+ span.x = startX;
+ span.y = iy;
+ span.end = (GLuint) ix - (GLuint) startX;
+ _swrast_write_rgba_span(ctx, &span);
+ }
}
}
else {
@@ -244,13 +250,20 @@
const GLfloat *pMax = vMax->attrib[FRAG_ATTRIB_WPOS];
const GLfloat dxdy = majDx / majDy;
const GLfloat xAdj = dxdy > 0 ? dxdy : 0.0F;
- GLfloat x = pMin[0] - (yMin - iyMin) * dxdy;
GLint iy;
- for (iy = iyMin; iy < iyMax; iy++, x += dxdy) {
+#ifdef _OPENMP
+#pragma omp parallel for schedule(dynamic) private(iy) firstprivate(span)
+#endif
+ for (iy = iyMin; iy < iyMax; iy++) {
+ GLfloat x = pMin[0] - (yMin - iy) * dxdy;
GLint ix, left, startX = (GLint) (x + xAdj);
GLuint count, n;
GLfloat coverage = 0.0F;
+#ifdef _OPENMP
+ /* each thread needs to use a different (global) SpanArrays variable */
+ span.array = SWRAST_CONTEXT(ctx)->SpanArrays + omp_get_thread_num();
+#endif
/* make sure we're not past the window edge */
if (startX >= ctx->DrawBuffer->_Xmax) {
startX = ctx->DrawBuffer->_Xmax - 1;
@@ -296,31 +309,30 @@
ATTRIB_LOOP_END
#endif
- if (startX <= ix)
- continue;
-
- n = (GLuint) startX - (GLuint) ix;
+ if (startX > ix) {
+ n = (GLuint) startX - (GLuint) ix;
- left = ix + 1;
+ left = ix + 1;
- /* shift all values to the left */
- /* XXX this is temporary */
- {
- SWspanarrays *array = span.array;
- GLint j;
- for (j = 0; j < (GLint) n; j++) {
- array->coverage[j] = array->coverage[j + left];
- COPY_CHAN4(array->rgba[j], array->rgba[j + left]);
+ /* shift all values to the left */
+ /* XXX this is temporary */
+ {
+ SWspanarrays *array = span.array;
+ GLint j;
+ for (j = 0; j < (GLint) n; j++) {
+ array->coverage[j] = array->coverage[j + left];
+ COPY_CHAN4(array->rgba[j], array->rgba[j + left]);
#ifdef DO_Z
- array->z[j] = array->z[j + left];
+ array->z[j] = array->z[j + left];
#endif
+ }
}
- }
- span.x = left;
- span.y = iy;
- span.end = n;
- _swrast_write_rgba_span(ctx, &span);
+ span.x = left;
+ span.y = iy;
+ span.end = n;
+ _swrast_write_rgba_span(ctx, &span);
+ }
}
}
}
diff --git a/src/mesa/swrast/s_context.c b/src/mesa/swrast/s_context.c
index def1531d7ff..4434f11b990 100644
--- a/src/mesa/swrast/s_context.c
+++ b/src/mesa/swrast/s_context.c
@@ -772,6 +772,11 @@ _swrast_CreateContext( struct gl_context *ctx )
{
GLuint i;
SWcontext *swrast = (SWcontext *)CALLOC(sizeof(SWcontext));
+#ifdef _OPENMP
+ const GLint maxThreads = omp_get_max_threads();
+#else
+ const GLint maxThreads = 1;
+#endif
if (SWRAST_DEBUG) {
_mesa_debug(ctx, "_swrast_CreateContext\n");
@@ -806,19 +811,25 @@ _swrast_CreateContext( struct gl_context *ctx )
for (i = 0; i < MAX_TEXTURE_IMAGE_UNITS; i++)
swrast->TextureSample[i] = NULL;
- swrast->SpanArrays = MALLOC_STRUCT(sw_span_arrays);
+ /* SpanArrays is global and shared by all SWspan instances. However, when
+ * using multiple threads, it is necessary to have one SpanArrays instance
+ * per thread.
+ */
+ swrast->SpanArrays = (SWspanarrays *) MALLOC(maxThreads * sizeof(SWspanarrays));
if (!swrast->SpanArrays) {
FREE(swrast);
return GL_FALSE;
}
- swrast->SpanArrays->ChanType = CHAN_TYPE;
+ for(i = 0; i < maxThreads; i++) {
+ swrast->SpanArrays[i].ChanType = CHAN_TYPE;
#if CHAN_TYPE == GL_UNSIGNED_BYTE
- swrast->SpanArrays->rgba = swrast->SpanArrays->rgba8;
+ swrast->SpanArrays[i].rgba = swrast->SpanArrays[i].rgba8;
#elif CHAN_TYPE == GL_UNSIGNED_SHORT
- swrast->SpanArrays->rgba = swrast->SpanArrays->rgba16;
+ swrast->SpanArrays[i].rgba = swrast->SpanArrays[i].rgba16;
#else
- swrast->SpanArrays->rgba = swrast->SpanArrays->attribs[FRAG_ATTRIB_COL0];
+ swrast->SpanArrays[i].rgba = swrast->SpanArrays[i].attribs[FRAG_ATTRIB_COL0];
#endif
+ }
/* init point span buffer */
swrast->PointSpan.primitive = GL_POINT;
@@ -826,7 +837,10 @@ _swrast_CreateContext( struct gl_context *ctx )
swrast->PointSpan.facing = 0;
swrast->PointSpan.array = swrast->SpanArrays;
- swrast->TexelBuffer = (GLfloat *) MALLOC(ctx->Const.MaxTextureImageUnits *
+ /* TexelBuffer is also global and normally shared by all SWspan instances;
+ * when running with multiple threads, create one per thread.
+ */
+ swrast->TexelBuffer = (GLfloat *) MALLOC(ctx->Const.MaxTextureImageUnits * maxThreads *
MAX_WIDTH * 4 * sizeof(GLfloat));
if (!swrast->TexelBuffer) {
FREE(swrast->SpanArrays);
diff --git a/src/mesa/swrast/s_texcombine.c b/src/mesa/swrast/s_texcombine.c
index 086ed0b33d7..80b9dff3cc2 100644
--- a/src/mesa/swrast/s_texcombine.c
+++ b/src/mesa/swrast/s_texcombine.c
@@ -48,7 +48,11 @@ typedef float (*float4_array)[4];
static INLINE float4_array
get_texel_array(SWcontext *swrast, GLuint unit)
{
+#ifdef _OPENMP
+ return (float4_array) (swrast->TexelBuffer + unit * MAX_WIDTH * 4 * omp_get_num_threads() + (MAX_WIDTH * 4 * omp_get_thread_num()));
+#else
return (float4_array) (swrast->TexelBuffer + unit * MAX_WIDTH * 4);
+#endif
}
diff --git a/src/mesa/tnl/t_pipeline.c b/src/mesa/tnl/t_pipeline.c
index 18f095f0d4b..881d5d5f535 100644
--- a/src/mesa/tnl/t_pipeline.c
+++ b/src/mesa/tnl/t_pipeline.c
@@ -146,7 +146,17 @@ void _tnl_run_pipeline( struct gl_context *ctx )
_tnl_notify_pipeline_output_change( ctx );
}
+#ifndef _OPENMP
+ /* Don't adjust FPU precision mode in case multiple threads are to be used.
+ * This would require that the additional threads also changed the FPU mode
+ * which is quite a mess as this had to be done in all parallelized sections;
+ * otherwise the master thread and all other threads are running in different
+ * modes, producing inconsistent results.
+ * Note that all x64 implementations don't define/use START_FAST_MATH, so
+ * this is "hack" is only used in i386 mode
+ */
START_FAST_MATH(__tmp);
+#endif
for (i = 0; i < tnl->pipeline.nr_stages ; i++) {
struct tnl_pipeline_stage *s = &tnl->pipeline.stages[i];
@@ -154,7 +164,9 @@ void _tnl_run_pipeline( struct gl_context *ctx )
break;
}
+#ifndef _OPENMP
END_FAST_MATH(__tmp);
+#endif
}