3 files changed, 1105 insertions, 670 deletions
diff --git a/src/mesa/shader/slang/library/slang_builtin_vec4.gc b/src/mesa/shader/slang/library/slang_builtin_vec4.gc
index d549c0133a4..f075a886bd2 100755
--- a/src/mesa/shader/slang/library/slang_builtin_vec4.gc
+++ b/src/mesa/shader/slang/library/slang_builtin_vec4.gc
@@ -181,10 +181,10 @@ float dot (vec3 v, vec3 u) {
     return v4.x;
 }
 
-float dot (vec4 v, vec4 u) {
-    __asm vec4_dot v, u;
-    return v.x;
-}
+//float dot (vec4 v, vec4 u) {
+//    __asm vec4_dot v, u;
+//    return v.x;
+//}
 
 
 float length (vec3 v) {
@@ -199,14 +199,10 @@ float length (vec4 v) {
 }
 
 
-vec3 normalize (vec3 v) {
-    vec4 u = vec4 (v, 0.0);
-    vec4 w = u;
-    __asm vec4_dot      u, u;
-    float l = sqrt (u.x);
-    __asm float_to_vec4 u, l;
-    __asm vec4_divide   w, u;
-    return w.xyz;
+vec3 normalize (vec3 v)   // returns v multiplied by inversesqrt(dot(v,v))
+{
+   float s = inversesqrt(dot(v,v));  // s = 1 / sqrt(v.v), the reciprocal of v's length
+   __retVal = v * s;   // the result is assigned to __retVal; there is no return statement
 }
 
 vec4 normalize (vec4 v) {
diff --git a/src/mesa/shader/slang/library/slang_common_builtin.gc b/src/mesa/shader/slang/library/slang_common_builtin.gc
index 768cef54748..0e94979d920 100755
--- a/src/mesa/shader/slang/library/slang_common_builtin.gc
+++ b/src/mesa/shader/slang/library/slang_common_builtin.gc
@@ -26,6 +26,8 @@
 // From Shader Spec, ver. 1.10, rev. 59
 //
 
+//bp: XXX these will probably go away since the values need to be
+//determined at runtime and may vary from one GLcontext to another...
 const int gl_MaxLights = 8;
 const int gl_MaxClipPlanes = 6;
 const int gl_MaxTextureUnits = 8;
@@ -155,128 +157,159 @@ struct gl_FogParameters {
 
 uniform gl_FogParameters gl_Fog;
 
+
+
+
+
 //
 // 8.1 Angle and Trigonometry Functions
 //
 
-float radians (float deg) {
-    return 3.141593 * deg / 180.0;
+//// radians
+
+float radians(const float deg)   // converts an angle from degrees to radians
+{
+   const float c = 3.1415926 / 180.0;   // pi / 180: radians per degree
+   __asm vec4_multiply __retVal.x, deg, c;   // result = deg * c
 }
 
-vec2 radians (vec2 deg) {
-    return vec2 (3.141593) * deg / vec2 (180.0);
+vec2 radians(const vec2 deg)
+{
+   const float c = 3.1415926 / 180.0;
+   __asm vec4_multiply __retVal.xy, deg.xy, c.xx;
 }
 
-vec3 radians (vec3 deg) {
-    return vec3 (3.141593) * deg / vec3 (180.0);
+vec3 radians(const vec3 deg)
+{
+   const float c = 3.1415926 / 180.0;
+   __asm vec4_multiply __retVal.xyz, deg.xyz, c.xxx;
 }
 
-vec4 radians (vec4 deg) {
-    return vec4 (3.141593) * deg / vec4 (180.0);
+vec4 radians(const vec4 deg)
+{
+   const float c = 3.1415926 / 180.0;
+   __asm vec4_multiply __retVal, deg, c.xxxx;
 }
 
-float degrees (float rad) {
-    return 180.0 * rad / 3.141593;
+
+//// degrees
+
+float degrees(const float rad)
+{
+   const float c = 180.0 / 3.1415926;
+   __asm vec4_multiply __retVal.x, rad, c;
 }
 
-vec2 degrees (vec2 rad) {
-    return vec2 (180.0) * rad / vec2 (3.141593);
+vec2 degrees(const vec2 rad)   // converts each component from radians to degrees
+{
+   const float c = 180.0 / 3.1415926;   // degrees per radian (same factor as the float overload)
+   __asm vec4_multiply __retVal.xy, rad.xy, c.xx;   // result.xy = rad.xy * c
 }
 
-vec3 degrees (vec3 rad) {
-    return vec3 (180.0) * rad / vec3 (3.141593);
+vec3 degrees(const vec3 rad)   // converts each component from radians to degrees
+{
+   const float c = 180.0 / 3.1415926;   // degrees per radian (same factor as the float overload)
+   __asm vec4_multiply __retVal.xyz, rad.xyz, c.xxx;   // result.xyz = rad.xyz * c
 }
 
-vec4 degrees (vec4 rad) {
-    return vec4 (180.0) * rad / vec4 (3.141593);
+vec4 degrees(const vec4 rad)   // converts each component from radians to degrees
+{
+   const float c = 180.0 / 3.1415926;   // degrees per radian (same factor as the float overload)
+   __asm vec4_multiply __retVal, rad, c.xxxx;   // result = rad * c
 }
 
-float sin (float angle) {
-    float x;
-    __asm float_sine x, angle;
-    return x;
+
+//// sin
+
+float sin(const float radians)
+{
+   __asm float_sine __retVal.x, radians;
 }
 
-vec2 sin (vec2 angle) {
-    return vec2 (
-        sin (angle.x),
-        sin (angle.y)
-    );
+vec2 sin(const vec2 radians)
+{
+   __asm float_sine __retVal.x, radians.x;
+   __asm float_sine __retVal.y, radians.y;
 }
 
-vec3 sin (vec3 angle) {
-    return vec3 (
-        sin (angle.x),
-        sin (angle.y),
-        sin (angle.z)
-    );
+vec3 sin(const vec3 radians)
+{
+   __asm float_sine __retVal.x, radians.x;
+   __asm float_sine __retVal.y, radians.y;
+   __asm float_sine __retVal.z, radians.z;
 }
 
-vec4 sin (vec4 angle) {
-    return vec4 (
-        sin (angle.x),
-        sin (angle.y),
-        sin (angle.z),
-        sin (angle.w)
-    );
+vec4 sin(const vec4 radians)
+{
+   __asm float_sine __retVal.x, radians.x;
+   __asm float_sine __retVal.y, radians.y;
+   __asm float_sine __retVal.z, radians.z;
+   __asm float_sine __retVal.w, radians.w;
 }
 
-float cos (float angle) {
-    return sin (angle + 1.5708);
+
+//// cos
+
+float cos(const float radians)
+{
+   __asm float_cosine __retVal.x, radians;
 }
 
-vec2 cos (vec2 angle) {
-    return vec2 (
-        cos (angle.x),
-        cos (angle.y)
-    );
+vec2 cos(const vec2 radians)
+{
+   __asm float_cosine __retVal.x, radians.x;
+   __asm float_cosine __retVal.y, radians.y;
 }
 
-vec3 cos (vec3 angle) {
-    return vec3 (
-        cos (angle.x),
-        cos (angle.y),
-        cos (angle.z)
-    );
+vec3 cos(const vec3 radians)
+{
+   __asm float_cosine __retVal.x, radians.x;
+   __asm float_cosine __retVal.y, radians.y;
+   __asm float_cosine __retVal.z, radians.z;
 }
 
-vec4 cos (vec4 angle) {
-    return vec4 (
-        cos (angle.x),
-        cos (angle.y),
-        cos (angle.z),
-        cos (angle.w)
-    );
+vec4 cos(const vec4 radians)
+{
+   __asm float_cosine __retVal.x, radians.x;
+   __asm float_cosine __retVal.y, radians.y;
+   __asm float_cosine __retVal.z, radians.z;
+   __asm float_cosine __retVal.w, radians.w;
 }
 
-float tan (float angle) {
-    return sin (angle) / cos (angle);
+
+
+//// tan
+
+float tan(const float angle)   // tangent, computed as sin(angle) / cos(angle)
+{
+   const float s = sin(angle);
+   const float c = cos(angle);
+   return s / c;   // no special handling where cos(angle) is zero
 }
 
-vec2 tan (vec2 angle) {
-    return vec2 (
-        tan (angle.x),
-        tan (angle.y)
-    );
+vec2 tan(const vec2 angle)
+{
+   const vec2 s = sin(angle);
+   const vec2 c = cos(angle);
+   return s / c;
 }
 
-vec3 tan (vec3 angle) {
-    return vec3 (
-        tan (angle.x),
-        tan (angle.y),
-        tan (angle.z)
-    );
+vec3 tan(const vec3 angle)
+{
+   const vec3 s = sin(angle);
+   const vec3 c = cos(angle);
+   return s / c;
 }
 
-vec4 tan (vec4 angle) {
-    return vec4 (
-        tan (angle.x),
-        tan (angle.y),
-        tan (angle.z),
-        tan (angle.w)
-    );
+vec4 tan(const vec4 angle)
+{
+   const vec4 s = sin(angle);
+   const vec4 c = cos(angle);
+   return s / c;
 }
 
+
+
 float asin (float x) {
     float y;
     __asm float_arcsine y, x;
@@ -404,675 +437,884 @@ vec4 atan (vec4 u, vec4 v) {
 // 8.2 Exponential Functions
 //
 
-float pow (float x, float y) {
-    float p;
-    __asm float_power p, x, y;
-    return p;
+//// pow
+
+float pow(const float a, const float b)
+{
+   __asm float_power __retVal.x, a, b;
 }
 
-vec2 pow (vec2 v, vec2 u) {
-    return vec2 (
-        pow (v.x, u.x),
-        pow (v.y, u.y)
-    );
+vec2 pow(const vec2 a, const vec2 b)
+{
+   __asm float_power __retVal.x, a.x, b.x;
+   __asm float_power __retVal.y, a.y, b.y;
 }
 
-vec3 pow (vec3 v, vec3 u) {
-    return vec3 (
-        pow (v.x, u.x),
-        pow (v.y, u.y),
-        pow (v.z, u.z)
-    );
+vec3 pow(const vec3 a, const vec3 b)
+{
+   __asm float_power __retVal.x, a.x, b.x;
+   __asm float_power __retVal.y, a.y, b.y;
+   __asm float_power __retVal.z, a.z, b.z;
 }
 
-vec4 pow (vec4 v, vec4 u) {
-    return vec4 (
-        pow (v.x, u.x),
-        pow (v.y, u.y),
-        pow (v.z, u.z),
-        pow (v.w, u.w)
-    );
+vec4 pow(const vec4 a, const vec4 b)
+{
+   __asm float_power __retVal.x, a.x, b.x;
+   __asm float_power __retVal.y, a.y, b.y;
+   __asm float_power __retVal.z, a.z, b.z;
+   __asm float_power __retVal.w, a.w, b.w;
 }
 
-float exp (float x) {
-    return pow (2.71828183, x);
+
+//// exp
+
+float exp(const float a)
+{
+   __asm float_exp __retVal.x, a;
 }
 
-vec2 exp (vec2 v) {
-    return pow (vec2 (2.71828183), v);
+vec2 exp(const vec2 a)
+{
+   __asm float_exp __retVal.x, a.x;
+   __asm float_exp __retVal.y, a.y;
 }
 
-vec3 exp (vec3 v) {
-    return pow (vec3 (2.71828183), v);
+vec3 exp(const vec3 a)
+{
+   __asm float_exp __retVal.x, a.x;
+   __asm float_exp __retVal.y, a.y;
+   __asm float_exp __retVal.z, a.z;
 }
 
-vec4 exp (vec4 v) {
-    return pow (vec4 (2.71828183), v);
+vec4 exp(const vec4 a)
+{
+   __asm float_exp __retVal.x, a.x;
+   __asm float_exp __retVal.y, a.y;
+   __asm float_exp __retVal.z, a.z;
+   __asm float_exp __retVal.w, a.w;
 }
 
-float log2 (float x) {
-    float y;
-    __asm float_log2 y, x;
-    return y;
+
+
+//// log2
+
+float log2(const float x)
+{
+   __asm float_log2 __retVal.x, x;
 }
 
-vec2 log2 (vec2 v) {
-    return vec2 (
-        log2 (v.x),
-        log2 (v.y)
-    );
+vec2 log2(const vec2 v)
+{
+   __asm float_log2 __retVal.x, v.x;
+   __asm float_log2 __retVal.y, v.y;
 }
 
-vec3 log2 (vec3 v) {
-    return vec3 (
-        log2 (v.x),
-        log2 (v.y),
-        log2 (v.z)
-    );
+vec3 log2(const vec3 v)
+{
+   __asm float_log2 __retVal.x, v.x;
+   __asm float_log2 __retVal.y, v.y;
+   __asm float_log2 __retVal.z, v.z;
 }
 
-vec4 log2 (vec4 v) {
-    return vec4 (
-        log2 (v.x),
-        log2 (v.y),
-        log2 (v.z),
-        log2 (v.w)
-    );
+vec4 log2(const vec4 v)
+{
+   __asm float_log2 __retVal.x, v.x;
+   __asm float_log2 __retVal.y, v.y;
+   __asm float_log2 __retVal.z, v.z;
+   __asm float_log2 __retVal.w, v.w;
 }
 
-float log (float x) {
-    return log2 (x) / log2 (2.71828183);
+
+//// log  (natural log)
+
+float log(const float x)
+{
+   // note:  logBaseB(x) = logBaseN(x) / logBaseN(B)
+   // compute log(x) = log2(x) / log2(e)
+   // c = 1.0 / log2(e) = 0.693147181
+   const float c = 0.693147181;
+   return log2(x) * c;
 }
 
-vec2 log (vec2 v) {
-    return log2 (v) / log2 (vec2 (2.71828183));
+vec2 log(const vec2 v)
+{
+   const float c = 0.693147181;
+   return log2(v) * c;
 }
 
-vec3 log (vec3 v) {
-    return log2 (v) / log2 (vec3 (2.71828183));
+vec3 log(const vec3 v)
+{
+   const float c = 0.693147181;
+   return log2(v) * c;
 }
 
-vec4 log (vec4 v) {
-    return log2 (v) / log2 (vec4 (2.71828183));
+vec4 log(const vec4 v)
+{
+   const float c = 0.693147181;
+   return log2(v) * c;
 }
 
-float exp2 (float x) {
-    return pow (2.0, x);
+
+//// exp2
+
+float exp2(const float a)
+{
+   __asm float_exp2 __retVal.x, a;
 }
 
-vec2 exp2 (vec2 v) {
-    return pow (vec2 (2.0), v);
+vec2 exp2(const vec2 a)
+{
+   __asm float_exp2 __retVal.x, a.x;
+   __asm float_exp2 __retVal.y, a.y;
 }
 
-vec3 exp2 (vec3 v) {
-    return pow (vec3 (2.0), v);
+vec3 exp2(const vec3 a)
+{
+   __asm float_exp2 __retVal.x, a.x;
+   __asm float_exp2 __retVal.y, a.y;
+   __asm float_exp2 __retVal.z, a.z;
 }
 
-vec4 exp2 (vec4 v) {
-    return pow (vec4 (2.0), v);
+vec4 exp2(const vec4 a)
+{
+   __asm float_exp2 __retVal.x, a.x;
+   __asm float_exp2 __retVal.y, a.y;
+   __asm float_exp2 __retVal.z, a.z;
+   __asm float_exp2 __retVal.w, a.w;
 }
 
-float sqrt (float x) {
-    return pow (x, 0.5);
+
+//// sqrt
+
+float sqrt(const float x)   // square root, computed as 1 / (1 / sqrt(x))
+{
+   float r;
+   __asm float_rsq r, x;   // r = 1 / sqrt(x)
+   __asm float_rcp __retVal.x, r;   // result = 1 / r
 }
 
-vec2 sqrt (vec2 v) {
-    return pow (v, vec2 (0.5));
+vec2 sqrt(const vec2 v)
+{
+   float r;
+   __asm float_rsq r, v.x;
+   __asm float_rcp __retVal.x, r;
+   __asm float_rsq r, v.y;
+   __asm float_rcp __retVal.y, r;
 }
 
-vec3 sqrt (vec3 v) {
-    return pow (v, vec3 (0.5));
+vec3 sqrt(const vec3 v)
+{
+   float r;
+   __asm float_rsq r, v.x;
+   __asm float_rcp __retVal.x, r;
+   __asm float_rsq r, v.y;
+   __asm float_rcp __retVal.y, r;
+   __asm float_rsq r, v.z;
+   __asm float_rcp __retVal.z, r;
 }
 
-vec4 sqrt (vec4 v) {
-    return pow (v, vec4 (0.5));
+vec4 sqrt(const vec4 v)
+{
+   float r;
+   __asm float_rsq r, v.x;
+   __asm float_rcp __retVal.x, r;
+   __asm float_rsq r, v.y;
+   __asm float_rcp __retVal.y, r;
+   __asm float_rsq r, v.z;
+   __asm float_rcp __retVal.z, r;
+   __asm float_rsq r, v.w;
+   __asm float_rcp __retVal.w, r;
 }
 
-float inversesqrt (float x) {
-    return 1.0 / sqrt (x);
+
+//// inversesqrt
+
+float inversesqrt(const float x)   // returns 1 / sqrt(x)
+{
+   __asm float_rsq __retVal.x, x;   // a single reciprocal-square-root instruction
 }
 
-vec2 inversesqrt (vec2 v) {
-    return vec2 (1.0) / sqrt (v);
+vec2 inversesqrt(const vec2 v)
+{
+   __asm float_rsq __retVal.x, v.x;
+   __asm float_rsq __retVal.y, v.y;
 }
 
-vec3 inversesqrt (vec3 v) {
-    return vec3 (1.0) / sqrt (v);
+vec3 inversesqrt(const vec3 v)
+{
+   __asm float_rsq __retVal.x, v.x;
+   __asm float_rsq __retVal.y, v.y;
+   __asm float_rsq __retVal.z, v.z;
 }
 
-vec4 inversesqrt (vec4 v) {
-    return vec4 (1.0) / sqrt (v);
+vec4 inversesqrt(const vec4 v)
+{
+   __asm float_rsq __retVal.x, v.x;
+   __asm float_rsq __retVal.y, v.y;
+   __asm float_rsq __retVal.z, v.z;
+   __asm float_rsq __retVal.w, v.w;
 }
 
+
+//// normalize
+
+float normalize(const float x)   // scalar overload: the argument x is not read
+{
+   __retVal.x = 1.0;   // NOTE(review): always 1.0; x / length(x) would be -1.0 for x < 0 -- confirm intent
 }
+
+vec2 normalize(const vec2 v)
+{
+   const float s = inversesqrt(dot(v, v));
+   __asm vec4_multiply __retVal.xy, v, s.xx;
+}
+
+vec3 normalize(const vec3 v)
+{
+   const float s = inversesqrt(dot(v, v));
+   __asm vec4_multiply __retVal.xyz, v, s.xxx;
+}
+
+vec4 normalize(const vec4 v)
+{
+   const float s = inversesqrt(dot(v, v));
+   __asm vec4_multiply __retVal, v, s.xxxx;
+}
+
+
+
 //
 // 8.3 Common Functions
 //
 
-float abs (float x) {
-    return x >= 0.0 ? x : -x;
+
+//// abs
+
+float abs(const float a)
+{
+   __asm vec4_abs __retVal.x, a;
 }
 
-vec2 abs (vec2 v) {
-    return vec2 (
-        abs (v.x),
-        abs (v.y)
-    );
+vec2 abs(const vec2 a)
+{
+   __asm vec4_abs __retVal.xy, a;
 }
 
-vec3 abs (vec3 v) {
-    return vec3 (
-        abs (v.x),
-        abs (v.y),
-        abs (v.z)
-    );
+vec3 abs(const vec3 a)
+{
+   __asm vec4_abs __retVal.xyz, a;
 }
 
-vec4 abs (vec4 v) {
-    return vec4 (
-        abs (v.x),
-        abs (v.y),
-        abs (v.z),
-        abs (v.w)
-    );
+vec4 abs(const vec4 a)
+{
+   __asm vec4_abs __retVal, a;
 }
 
-float sign (float x) {
-    return x > 0.0 ? 1.0 : x < 0.0 ? -1.0 : 0.0;
+
+//// sign
+
+float sign(const float x)
+{
+   float p, n;
+   __asm vec4_sgt p.x, x, 0.0;            // p = (x > 0)
+   __asm vec4_sgt n.x, 0.0, x;            // n = (x < 0)
+   __asm vec4_subtract __retVal.x, p, n;  // sign = p - n
 }
 
-vec2 sign (vec2 v) {
-    return vec2 (
-        sign (v.x),
-        sign (v.y)
-    );
+vec2 sign(const vec2 v)   // per component: 1.0 if v > 0, -1.0 if v < 0, otherwise 0.0
+{
+   vec2 p, n;
+   __asm vec4_sgt p.xy, v, 0.0;             // p = (v > 0)
+   __asm vec4_sgt n.xy, 0.0, v;             // n = (v < 0)
+   __asm vec4_subtract __retVal.xy, p, n;   // sign = p - n
 }
 
-vec3 sign (vec3 v) {
-    return vec3 (
-        sign (v.x),
-        sign (v.y),
-        sign (v.z)
-    );
+vec3 sign(const vec3 v)
+{
+   vec3 p, n;
+   __asm vec4_sgt p.xyz, v, 0.0;
+   __asm vec4_sgt n.xyz, 0.0, v;
+   __asm vec4_subtract __retVal.xyz, p, n;
 }
 
-vec4 sign (vec4 v) {
-    return vec4 (
-        sign (v.x),
-        sign (v.y),
-        sign (v.z),
-        sign (v.w)
-    );
+vec4 sign(const vec4 v)
+{
+   vec4 p, n;
+   __asm vec4_sgt p, v, 0.0;
+   __asm vec4_sgt n, 0.0, v;
+   __asm vec4_subtract __retVal, p, n;
 }
 
-float floor (float x) {
-    float y;
-    __asm float_floor y, x;
-    return y;
+
+//// floor
+
+float floor(const float a)
+{
+   __asm vec4_floor __retVal.x, a;
 }
 
-vec2 floor (vec2 v) {
-    return vec2 (
-        floor (v.x),
-        floor (v.y)
-    );
+vec2 floor(const vec2 a)
+{
+   __asm vec4_floor __retVal.xy, a;
 }
 
-vec3 floor (vec3 v) {
-    return vec3 (
-        floor (v.x),
-        floor (v.y),
-        floor (v.z)
-    );
+vec3 floor(const vec3 a)
+{
+   __asm vec4_floor __retVal.xyz, a;
 }
 
-vec4 floor (vec4 v) {
-    return vec4 (
-        floor (v.x),
-        floor (v.y),
-        floor (v.z),
-        floor (v.w)
-    );
+vec4 floor(const vec4 a)
+{
+   __asm vec4_floor __retVal, a;
 }
 
-float ceil (float x) {
-    float y;
-    __asm float_ceil y, x;
-    return y;
+
+//// ceil
+
+float ceil(const float a)   // ceiling, computed as -floor(-a)
+{
+   // XXX this could be improved
+   float b = -a;
+   __asm vec4_floor b, b;   // b = floor(-a), source and destination are the same
+   __retVal.x = -b;   // negate again to get the ceiling
 }
 
-vec2 ceil (vec2 v) {
-    return vec2 (
-        ceil (v.x),
-        ceil (v.y)
-    );
+vec2 ceil(const vec2 a)
+{
+   vec2 b = -a;
+   __asm vec4_floor b, b;
+   __retVal.xy = -b;
 }
 
-vec3 ceil (vec3 v) {
-    return vec3 (
-        ceil (v.x),
-        ceil (v.y),
-        ceil (v.z)
-    );
+vec3 ceil(const vec3 a)
+{
+   vec3 b = -a;
+   __asm vec4_floor b, b;
+   __retVal.xyz = -b;
 }
 
-vec4 ceil (vec4 v) {
-    return vec4 (
-        ceil (v.x),
-        ceil (v.y),
-        ceil (v.z),
-        ceil (v.w)
-    );
+vec4 ceil(const vec4 a)
+{
+   vec4 b = -a;
+   __asm vec4_floor b, b;
+   __retVal = -b;
 }
 
-float fract (float x) {
-    return x - floor (x);
+
+//// fract
+
+float fract(const float a)
+{
+   __asm vec4_frac __retVal.x, a;
 }
 
-vec2 fract (vec2 v) {
-    return v - floor (v);
+vec2 fract(const vec2 a)
+{
+   __asm vec4_frac __retVal.xy, a;
 }
 
-vec3 fract (vec3 v) {
-    return v - floor (v);
+vec3 fract(const vec3 a)
+{
+   __asm vec4_frac __retVal.xyz, a;
 }
 
-vec4 fract (vec4 v) {
-    return v - floor (v);
+vec4 fract(const vec4 a)
+{
+   __asm vec4_frac __retVal, a;
 }
 
-float mod (float x, float y) {
-    return x - y * floor (x / y);
+
+//// mod  (very untested!)
+
+float mod(const float a, const float b)   // a - b * floor(a / b)
+{
+    float oneOverB;
+    __asm float_rcp oneOverB, b;   // oneOverB = 1 / b, so the divide becomes a multiply
+    __retVal.x = a - b * floor(a * oneOverB);
 }
 
-vec2 mod (vec2 v, float u) {
-    return v - u * floor (v / u);
+vec2 mod(const vec2 a, const float b)
+{
+    float oneOverB;
+    __asm float_rcp oneOverB, b;
+    __retVal.xy = a - b * floor(a * oneOverB);
 }
 
-vec3 mod (vec3 v, float u) {
-    return v - u * floor (v / u);
+vec3 mod(const vec3 a, const float b)
+{
+    float oneOverB;
+    __asm float_rcp oneOverB, b;
+    __retVal.xyz = a - b * floor(a * oneOverB);
 }
 
-vec4 mod (vec4 v, float u) {
-    return v - u * floor (v / u);
+vec4 mod(const vec4 a, const float b)
+{
+    float oneOverB;
+    __asm float_rcp oneOverB, b;
+    __retVal = a - b * floor(a * oneOverB);
 }
 
-vec2 mod (vec2 v, vec2 u) {
-    return v - u * floor (v / u);
+vec2 mod(const vec2 a, const vec2 b)
+{
+    float oneOverBx, oneOverBy;
+    __asm float_rcp oneOverBx, b.x;
+    __asm float_rcp oneOverBy, b.y;
+    __retVal.x = a.x - b.x * floor(a.x * oneOverBx);
+    __retVal.y = a.y - b.y * floor(a.y * oneOverBy);
 }
 
-vec3 mod (vec3 v, vec3 u) {
-    return v - u * floor (v / u);
+vec3 mod(const vec3 a, const vec3 b)
+{
+    float oneOverBx, oneOverBy, oneOverBz;
+    __asm float_rcp oneOverBx, b.x;
+    __asm float_rcp oneOverBy, b.y;
+    __asm float_rcp oneOverBz, b.z;
+    __retVal.x = a.x - b.x * floor(a.x * oneOverBx);
+    __retVal.y = a.y - b.y * floor(a.y * oneOverBy);
+    __retVal.z = a.z - b.z * floor(a.z * oneOverBz);
 }
 
-vec4 mod (vec4 v, vec4 u) {
-    return v - u * floor (v / u);
+vec4 mod(const vec4 a, const vec4 b)   // per component: a - b * floor(a / b)
+{
+    float oneOverBx, oneOverBy, oneOverBz, oneOverBw;   // reciprocal of each component of b
+    __asm float_rcp oneOverBx, b.x;
+    __asm float_rcp oneOverBy, b.y;
+    __asm float_rcp oneOverBz, b.z;
+    __asm float_rcp oneOverBw, b.w;
+    __retVal.x = a.x - b.x * floor(a.x * oneOverBx);
+    __retVal.y = a.y - b.y * floor(a.y * oneOverBy);
+    __retVal.z = a.z - b.z * floor(a.z * oneOverBz);
+    __retVal.w = a.w - b.w * floor(a.w * oneOverBw);   // w uses 1/b.w (was 1/b.z)
 }
 
-float min (float x, float y) {
-    return x < y ? x : y;
+
+//// min
+
+float min(const float a, const float b)
+{
+   __asm vec4_min __retVal.x, a.x, b.x;
 }
 
-vec2 min (vec2 v, vec2 u) {
-    return vec2 (
-        min (v.x, u.x),
-        min (v.y, u.y)
-    );
+vec2 min(const vec2 a, const vec2 b)
+{
+   __asm vec4_min __retVal.xy, a.xy, b.xy;
 }
 
-vec3 min (vec3 v, vec3 u) {
-    return vec3 (
-        min (v.x, u.x),
-        min (v.y, u.y),
-        min (v.z, u.z)
-    );
+vec3 min(const vec3 a, const vec3 b)
+{
+   __asm vec4_min __retVal.xyz, a.xyz, b.xyz;
 }
 
-vec4 min (vec4 v, vec4 u) {
-    return vec4 (
-        min (v.x, u.x),
-        min (v.y, u.y),
-        min (v.z, u.z),
-        min (v.w, u.w)
-    );
+vec4 min(const vec4 a, const vec4 b)
+{
+   __asm vec4_min __retVal, a, b;
 }
 
-vec2 min (vec2 v, float y) {
-    return min (v, vec2 (y));
+vec2 min(const vec2 a, const float b)   // per-component minimum of a and the scalar b
+{
+   __asm vec4_min __retVal.xy, a.xy, b.xx;   // write mask .xy, as in min(vec2, vec2)
 }
 
-vec3 min (vec3 v, float y) {
-    return min (v, vec3 (y));
+vec3 min(const vec3 a, const float b)   // per-component minimum of a and the scalar b
+{
+   __asm vec4_min __retVal.xyz, a.xyz, b.xxx;   // write mask .xyz, as in min(vec3, vec3)
 }
 
-vec4 min (vec4 v, float y) {
-    return min (v, vec4 (y));
+vec4 min(const vec4 a, const float b)
+{
+   __asm vec4_min __retVal, a, b.xxxx;
 }
 
-float max (float x, float y) {
-    return x < y ? y : x;
+
+//// max
+
+float max(const float a, const float b)
+{
+   __asm vec4_max __retVal.x, a.x, b.x;
 }
 
-vec2 max (vec2 v, vec2 u) {
-    return vec2 (
-        max (v.x, u.x),
-        max (v.y, u.y)
-    );
+vec2 max(const vec2 a, const vec2 b)
+{
+   __asm vec4_max __retVal.xy, a.xy, b.xy;
 }
 
-vec3 max (vec3 v, vec3 u) {
-    return vec3 (
-        max (v.x, u.x),
-        max (v.y, u.y),
-        max (v.z, u.z)
-    );
+vec3 max(const vec3 a, const vec3 b)
+{
+   __asm vec4_max __retVal.xyz, a.xyz, b.xyz;
 }
 
-vec4 max (vec4 v, vec4 u) {
-    return vec4 (
-        max (v.x, u.x),
-        max (v.y, u.y),
-        max (v.z, u.z),
-        max (v.w, u.w)
-    );
+vec4 max(const vec4 a, const vec4 b)
+{
+   __asm vec4_max __retVal, a, b;
 }
 
-vec2 max (vec2 v, float y) {
-    return max (v, vec2 (y));
+vec2 max(const vec2 a, const float b)   // per-component maximum of a and the scalar b
+{
+   __asm vec4_max __retVal.xy, a.xy, b.xx;   // write mask .xy, as in max(vec2, vec2)
 }
 
-vec3 max (vec3 v, float y) {
-    return max (v, vec3 (y));
+vec3 max(const vec3 a, const float b)   // per-component maximum of a and the scalar b
+{
+   __asm vec4_max __retVal.xyz, a.xyz, b.xxx;   // write mask .xyz, as in max(vec3, vec3)
 }
 
-vec4 max (vec4 v, float y) {
-    return max (v, vec4 (y));
+vec4 max(const vec4 a, const float b)
+{
+   __asm vec4_max __retVal, a, b.xxxx;
 }
 
-float clamp (float x, float minVal, float maxVal) {
-    return min (max (x, minVal), maxVal);
+
+//// clamp
+
+float clamp(const float val, const float minVal, const float maxVal)
+{
+   float t;
+   __asm vec4_max t, val, minVal;
+   __asm vec4_min __retVal.x, t, maxVal;
 }
 
-vec2 clamp (vec2 x, float minVal, float maxVal) {
-    return min (max (x, minVal), maxVal);
+vec2 clamp(const vec2 val, const float minVal, const float maxVal)
+{
+   vec2 t;
+   __asm vec4_max t.xy, val.xy, minVal.xx;
+   __asm vec4_min __retVal.xy, t.xy, maxVal.xx;
 }
 
-vec3 clamp (vec3 x, float minVal, float maxVal) {
-    return min (max (x, minVal), maxVal);
+vec3 clamp(const vec3 val, const float minVal, const float maxVal)
+{
+   vec3 t;
+   __asm vec4_max t.xyz, val.xyz, minVal.xxx;
+   __asm vec4_min __retVal.xyz, t.xyz, maxVal.xxx;
 }
 
-vec4 clamp (vec4 x, float minVal, float maxVal) {
-    return min (max (x, minVal), maxVal);
+vec4 clamp(const vec4 val, const float minVal, const float maxVal)
+{
+   vec4 t;
+   __asm vec4_max t, val, minVal.xxxx;
+   __asm vec4_min __retVal, t, maxVal.xxxx;
 }
 
-vec2 clamp (vec2 x, vec2 minVal, vec2 maxVal) {
-    return min (max (x, minVal), maxVal);
+vec2 clamp(const vec2 val, const vec2 minVal, const vec2 maxVal)   // per component: min(max(val, minVal), maxVal)
+{
+   vec2 t;
+   __asm vec4_max t.xy, val.xy, minVal.xy;          // t = max(val, minVal)
+   __asm vec4_min __retVal.xy, t.xy, maxVal.xy;     // each component uses its own upper bound (was maxVal.xxxx)
 }
 
-vec3 clamp (vec3 x, vec3 minVal, vec3 maxVal) {
-    return min (max (x, minVal), maxVal);
+vec3 clamp(const vec3 val, const vec3 minVal, const vec3 maxVal)   // per component: min(max(val, minVal), maxVal)
+{
+   vec3 t;
+   __asm vec4_max t.xyz, val.xyz, minVal.xyz;         // t = max(val, minVal)
+   __asm vec4_min __retVal.xyz, t.xyz, maxVal.xyz;    // each component uses its own upper bound (was maxVal.xxxx)
 }
 
-vec4 clamp (vec4 x, vec4 minVal, vec4 maxVal) {
-    return min (max (x, minVal), maxVal);
+vec4 clamp(const vec4 val, const vec4 minVal, const vec4 maxVal)
+{
+   vec4 t;
+   __asm vec4_max t, val, minVal;
+   __asm vec4_min __retVal, t, maxVal;
 }
 
-float mix (float x, float y, float a) {
-    return x * (1.0 - a) + y * a;
+
+//// mix
+
+float mix(const float x, const float y, const float a)
+{
+   const float d = y - x;
+   return x + d * a;  // MAD
 }
 
-vec2 mix (vec2 x, vec2 y, float a) {
-    return x * (1.0 - a) + y * a;
+vec2 mix(const vec2 x, const vec2 y, const float a)
+{
+   const vec2 d = y - x;
+   return x + d * a;  // MAD
 }
 
-vec3 mix (vec3 x, vec3 y, float a) {
-    return x * (1.0 - a) + y * a;
+vec3 mix(const vec3 x, const vec3 y, const float a)
+{
+   const vec3 d = y - x;
+   return x + d * a;  // MAD
 }
 
-vec4 mix (vec4 x, vec4 y, float a) {
-    return x * (1.0 - a) + y * a;
+vec4 mix(const vec4 x, const vec4 y, const float a)
+{
+   const vec4 d = y - x;
+   return x + d * a;  // MAD
 }
 
-vec2 mix (vec2 x, vec2 y, vec2 a) {
-    return x * (1.0 - a) + y * a;
+vec2 mix(const vec2 x, const vec2 y, const vec2 a)
+{
+   const vec2 d = y - x;
+   return x + d * a;  // MAD
 }
 
-vec3 mix (vec3 x, vec3 y, vec3 a) {
-    return x * (1.0 - a) + y * a;
+vec3 mix(const vec3 x, const vec3 y, const vec3 a)
+{
+   const vec3 d = y - x;
+   return x + d * a;  // MAD
 }
 
-vec4 mix (vec4 x, vec4 y, vec4 a) {
-    return x * (1.0 - a) + y * a;
+vec4 mix(const vec4 x, const vec4 y, const vec4 a)
+{
+   const vec4 d = y - x;
+   return x + d * a;  // MAD
 }
 
-float step (float edge, float x) {
-    return x < edge ? 0.0 : 1.0;
+
+//// step (untested)
+
+float step(const float edge, const float x)   // 0.0 if x < edge, otherwise 1.0
+{
+   __asm vec4_sge __retVal.x, x, edge;   // x >= edge, so x == edge yields 1.0 (sgt gave 0.0)
 }
 
-vec2 step (vec2 edge, vec2 v) {
-    return vec2 (
-        step (edge.x, v.x),
-        step (edge.y, v.y)
-    );
+vec2 step(const vec2 edge, const vec2 x)   // per component: 0.0 if x < edge, otherwise 1.0
+{
+   __asm vec4_sge __retVal.xy, x, edge;   // x >= edge, so x == edge yields 1.0 (sgt gave 0.0)
 }
 
-vec3 step (vec3 edge, vec3 v) {
-    return vec3 (
-        step (edge.x, v.x),
-        step (edge.y, v.y),
-        step (edge.z, v.z)
-    );
+vec3 step(const vec3 edge, const vec3 x)   // per component: 0.0 if x < edge, otherwise 1.0
+{
+   __asm vec4_sge __retVal.xyz, x, edge;   // x >= edge, so x == edge yields 1.0 (sgt gave 0.0)
 }
 
-vec4 step (vec4 edge, vec4 v) {
-    return vec4 (
-        step (edge.x, v.x),
-        step (edge.y, v.y),
-        step (edge.z, v.z),
-        step (edge.w, v.w)
-    );
+vec4 step(const vec4 edge, const vec4 x)   // per component: 0.0 if x < edge, otherwise 1.0
+{
+   __asm vec4_sge __retVal, x, edge;   // x >= edge, so x == edge yields 1.0 (sgt gave 0.0)
 }
 
-vec2 step (float edge, vec2 v) {
-    return step (vec2 (edge), v);
+vec2 step(const float edge, const vec2 v)   // per component: 0.0 if v < edge, otherwise 1.0
+{
+   __asm vec4_sge __retVal.xy, v, edge.xx;   // v >= edge, so v == edge yields 1.0 (sgt gave 0.0)
 }
 
-vec3 step (float edge, vec3 v) {
-    return step (vec3 (edge), v);
+vec3 step(const float edge, const vec3 v)   // per component: 0.0 if v < edge, otherwise 1.0
+{
+   __asm vec4_sge __retVal.xyz, v, edge.xxx;   // v >= edge, so v == edge yields 1.0 (sgt gave 0.0)
 }
 
-vec4 step (float edge, vec4 v) {
-    return step (vec4 (edge), v);
+vec4 step(const float edge, const vec4 v)   // per component: 0.0 if v < edge, otherwise 1.0
+{
+   __asm vec4_sge __retVal, v, edge.xxxx;   // v >= edge, so v == edge yields 1.0 (sgt gave 0.0)
 }
 
-float smoothstep (float edge0, float edge1, float x) {
-    float t = clamp ((x - edge0) / (edge1 - edge0), 0.0, 1.0);
+
+//// smoothstep (untested)
+
+float smoothstep(const float edge0, const float edge1, const float x)   // cubic 3t^2 - 2t^3 blend of the clamped position of x between the edges
+{
+    float t = clamp((x - edge0) / (edge1 - edge0), 0.0, 1.0);   // position of x between the edges, limited to [0, 1]; edge0 == edge1 is not guarded
     return t * t * (3.0 - 2.0 * t);
 }
 
-vec2 smoothstep (vec2 edge0, vec2 edge1, vec2 v) {
-    return vec2 (
-        smoothstep (edge0.x, edge1.x, v.x),
-        smoothstep (edge0.y, edge1.y, v.y)
-    );
+vec2 smoothstep(const vec2 edge0, const vec2 edge1, const vec2 v)
+{
+   vec2 t = clamp((v - edge0) / (edge1 - edge0), 0.0, 1.0);
+   return t * t * (3.0 - 2.0 * t);
 }
 
-vec3 smoothstep (vec3 edge0, vec3 edge1, vec3 v) {
-    return vec3 (
-        smoothstep (edge0.x, edge1.x, v.x),
-        smoothstep (edge0.y, edge1.y, v.y),
-        smoothstep (edge0.z, edge1.z, v.z)
-    );
+vec3 smoothstep(const vec3 edge0, const vec3 edge1, const vec3 v)
+{
+   vec3 t = clamp((v - edge0) / (edge1 - edge0), 0.0, 1.0);
+   return t * t * (3.0 - 2.0 * t);
 }
 
-vec4 smoothstep (vec4 edge0, vec4 edge1, vec4 v) {
-    return vec4 (
-        smoothstep (edge0.x, edge1.x, v.x),
-        smoothstep (edge0.y, edge1.y, v.y),
-        smoothstep (edge0.z, edge1.z, v.z),
-        smoothstep (edge0.w, edge1.w, v.w)
-    );
+vec4 smoothstep(const vec4 edge0, const vec4 edge1, const vec4 v)
+{
+   vec4 t = clamp((v - edge0) / (edge1 - edge0), 0.0, 1.0);
+   return t * t * (3.0 - 2.0 * t);
 }
 
-vec2 smoothstep (float edge0, float edge1, vec2 v) {
-    return vec2 (
-        smoothstep (edge0, edge1, v.x),
-        smoothstep (edge0, edge1, v.y)
-    );
+vec2 smoothstep(const float edge0, const float edge1, const vec2 v)
+{
+   vec2 t = clamp((v - edge0) / (edge1 - edge0), 0.0, 1.0);
+   return t * t * (3.0 - 2.0 * t);
 }
 
-vec3 smoothstep (float edge0, float edge1, vec3 v) {
-    return vec3 (
-        smoothstep (edge0, edge1, v.x),
-        smoothstep (edge0, edge1, v.y),
-        smoothstep (edge0, edge1, v.z)
-    );
+vec3 smoothstep(const float edge0, const float edge1, const vec3 v)
+{
+   vec3 t = clamp((v - edge0) / (edge1 - edge0), 0.0, 1.0);
+   return t * t * (3.0 - 2.0 * t);
 }
 
-vec4 smoothstep (float edge0, float edge1, vec4 v) {
-    return vec4 (
-        smoothstep (edge0, edge1, v.x),
-        smoothstep (edge0, edge1, v.y),
-        smoothstep (edge0, edge1, v.z),
-        smoothstep (edge0, edge1, v.w)
-    );
+vec4 smoothstep(const float edge0, const float edge1, const vec4 v)
+{
+   vec4 t = clamp((v - edge0) / (edge1 - edge0), 0.0, 1.0);
+   return t * t * (3.0 - 2.0 * t);
 }
 
+
+
 //
 // 8.4 Geometric Functions
 //
 
-float dot (float x, float y) {
-    return x * y;
-}
 
-float dot (vec2 v, vec2 u) {
-    return v.x * u.x + v.y * u.y;
-}
+//// length
 
-float dot (vec3 v, vec3 u) {
-    return v.x * u.x + v.y * u.y + v.z * u.z;
+float length(const float x)
+{
+   return abs(x);
 }
 
-float dot (vec4 v, vec4 u) {
-    return v.x * u.x + v.y * u.y + v.z * u.z + v.w * u.w;
+float length(const vec2 v)
+{
+   float r;
+   const float p = dot(v, v);      // p = v.x * v.x + v.y * v.y
+   __asm float_rsq r, p;           // r = 1 / sqrt(p)
+   __asm float_rcp __retVal.x, r;  // retVal = 1 / r
 }
 
-float length (float x) {
-    return sqrt (dot (x, x));
+float length(const vec3 v)
+{
+   float r;
+   const float p = dot(v, v);      // p = v.x * v.x + v.y * v.y + v.z * v.z
+   __asm float_rsq r, p;           // r = 1 / sqrt(p)
+   __asm float_rcp __retVal.x, r;  // retVal = 1 / r
 }
 
-float length (vec2 v) {
-    return sqrt (dot (v, v));
+float length(const vec4 v)
+{
+   float r;
+   const float p = dot(v, v);      // p = v.x * v.x + v.y * v.y + ...
+   __asm float_rsq r, p;           // r = 1 / sqrt(p)
+   __asm float_rcp __retVal.x, r;  // retVal = 1 / r
 }
 
-float length (vec3 v) {
-    return sqrt (dot (v, v));
-}
 
-float length (vec4 v) {
-    return sqrt (dot (v, v));
-}
+//// distance
 
-float distance (float x, float y) {
-    return length (x - y);
+float distance(const float x, const float y)
+{
+   const float d = x - y;
+   return length(d);
 }
 
-float distance (vec2 v, vec2 u) {
-    return length (v - u);
+float distance(const vec2 v, const vec2 u)
+{
+   const vec2 d = v - u;
+   return length(d);
 }
 
-float distance (vec3 v, vec3 u) {
-    return length (v - u);
+float distance(const vec3 v, const vec3 u)
+{
+   const vec3 d = v - u;
+   return length(d);
 }
 
-float distance (vec4 v, vec4 u) {
-    return length (v - u);
+float distance(const vec4 v, const vec4 u)
+{
+   const vec4 d = v - u;
+   return length(d);
 }
 
-vec3 cross (vec3 v, vec3 u) {
-    return vec3 (
-        v.y * u.z - u.y * v.z,
-        v.z * u.x - u.z * v.x,
-        v.x * u.y - u.x * v.y
-    );
-}
 
-float normalize (float x) {
-    return 1.0;
-}
+//// cross
 
-vec2 normalize (vec2 v) {
-    return v / length (v);
+vec3 cross(const vec3 v, const vec3 u)
+{
+   __asm vec3_cross __retVal.xyz, v, u;
 }
 
-vec3 normalize (vec3 v) {
-    return v / length (v);
-}
 
-vec4 normalize (vec4 v) {
-    return v / length (v);
-}
+//// faceforward
 
-float faceforward (float N, float I, float Nref) {
-    return dot (Nref, I) < 0.0 ? N : -N;
+float faceforward(const float N, const float I, const float Nref)
+{
+    // this could probably be done better
+    const float d = dot(Nref, I);
+    float s;
+    __asm vec4_sgt s.x, 0.0, d;  // s = (0.0 > d) ? 1 : 0
+    return mix(-N, N, s);
 }
 
-vec2 faceforward (vec2 N, vec2 I, vec2 Nref) {
-    return dot (Nref, I) < 0.0 ? N : -N;
+vec2 faceforward(const vec2 N, const vec2 I, const vec2 Nref)
+{
+    // this could probably be done better
+    const float d = dot(Nref, I);
+    float s;
+    __asm vec4_sgt s.x, 0.0, d;  // s = (0.0 > d) ? 1 : 0
+    return mix(-N, N, s);
 }
 
-vec3 faceforward (vec3 N, vec3 I, vec3 Nref) {
-    return dot (Nref, I) < 0.0 ? N : -N;
+vec3 faceforward(const vec3 N, const vec3 I, const vec3 Nref)
+{
+    // this could probably be done better
+    const float d = dot(Nref, I);
+    float s;
+    __asm vec4_sgt s.x, 0.0, d;  // s = (0.0 > d) ? 1 : 0
+    return mix(-N, N, s);
 }
 
-vec4 faceforward (vec4 N, vec4 I, vec4 Nref) {
-    return dot (Nref, I) < 0.0 ? N : -N;
+vec4 faceforward(const vec4 N, const vec4 I, const vec4 Nref)
+{
+    // this could probably be done better
+    const float d = dot(Nref, I);
+    float s;
+    __asm vec4_sgt s.x, 0.0, d;  // s = (0.0 > d) ? 1 : 0
+    return mix(-N, N, s);
 }
 
-float reflect (float I, float N) {
-    return I - 2.0 * dot (N, I) * N;
+
+//// reflect
+
+float reflect(const float I, const float N)
+{
+   return I - 2.0 * dot(N, I) * N;
 }
 
-vec2 reflect (vec2 I, vec2 N) {
-    return I - 2.0 * dot (N, I) * N;
+vec2 reflect(const vec2 I, const vec2 N)
+{
+   return I - 2.0 * dot(N, I) * N;
 }
 
-vec3 reflect (vec3 I, vec3 N) {
-    return I - 2.0 * dot (N, I) * N;
+vec3 reflect(const vec3 I, const vec3 N)
+{
+   return I - 2.0 * dot(N, I) * N;
 }
 
-vec4 reflect (vec4 I, vec4 N) {
-    return I - 2.0 * dot (N, I) * N;
+vec4 reflect(const vec4 I, const vec4 N)
+{
+   return I - 2.0 * dot(N, I) * N;
 }
 
-float refract (float I, float N, float eta) {
-    float k = 1.0 - eta * eta * (1.0 - dot (N, I) * dot (N, I));
-    if (k < 0.0)
-        return 0.0;
-    return eta * I - (eta * dot (N, I) + sqrt (k)) * N;
+//// refract
+
+float refract(const float I, const float N, const float eta)
+{
+   float k = 1.0 - eta * eta * (1.0 - dot(N, I) * dot(N, I));
+   if (k < 0.0)
+       return 0.0;
+   return eta * I - (eta * dot(N, I) + sqrt(k)) * N;
 }
 
-vec2 refract (vec2 I, vec2 N, float eta) {
-    float k = 1.0 - eta * eta * (1.0 - dot (N, I) * dot (N, I));
-    if (k < 0.0)
-        return 0.0;
-    return eta * I - (eta * dot (N, I) + sqrt (k)) * N;
+vec2 refract(const vec2 I, const vec2 N, const float eta)   // eta * I - (eta * dot(N, I) + sqrt(k)) * N, or zero when k < 0
+{
+   float k = 1.0 - eta * eta * (1.0 - dot(N, I) * dot(N, I));
+   if (k < 0.0)   // sqrt(k) would be undefined: the result is the zero vector
+      return vec2(0.0);   // explicit vec2 so the value matches the return type
+   return eta * I - (eta * dot(N, I) + sqrt(k)) * N;
 }
 
-vec3 refract (vec3 I, vec3 N, float eta) {
-    float k = 1.0 - eta * eta * (1.0 - dot (N, I) * dot (N, I));
-    if (k < 0.0)
-        return 0.0;
-    return eta * I - (eta * dot (N, I) + sqrt (k)) * N;
+vec3 refract(const vec3 I, const vec3 N, const float eta)   // eta * I - (eta * dot(N, I) + sqrt(k)) * N, or zero when k < 0
+{
+   float k = 1.0 - eta * eta * (1.0 - dot(N, I) * dot(N, I));
+   if (k < 0.0)   // sqrt(k) would be undefined: the result is the zero vector
+      return vec3(0.0);   // explicit vec3 so the value matches the return type
+   return eta * I - (eta * dot(N, I) + sqrt(k)) * N;
 }
 
-vec4 refract (vec4 I, vec4 N, float eta) {
-    float k = 1.0 - eta * eta * (1.0 - dot (N, I) * dot (N, I));
-    if (k < 0.0)
-        return 0.0;
-    return eta * I - (eta * dot (N, I) + sqrt (k)) * N;
+vec4 refract(const vec4 I, const vec4 N, const float eta)   // eta * I - (eta * dot(N, I) + sqrt(k)) * N, or zero when k < 0
+{
+   float k = 1.0 - eta * eta * (1.0 - dot(N, I) * dot(N, I));
+   if (k < 0.0)   // sqrt(k) would be undefined: the result is the zero vector
+      return vec4(0.0);   // explicit vec4 so the value matches the return type
+   return eta * I - (eta * dot(N, I) + sqrt(k)) * N;
 }
 
+
+
+
 //
 // 8.5 Matrix Functions
 //
@@ -1089,190 +1331,286 @@ mat4 matrixCompMult (mat4 m, mat4 n) {
     return mat4 (m[0] * n[0], m[1] * n[1], m[2] * n[2], m[3] * n[3]);
 }
 
+
+
+
 //
 // 8.6 Vector Relational Functions
 //
 
-bvec2 lessThan (vec2 v, vec2 u) {
-    return bvec2 (v.x < u.x, v.y < u.y);
+//// lessThan
+
+bvec2 lessThan(const vec2 v, const vec2 u)
+{
+   __asm vec4_sgt __retVal.xy, u, v;   // v < u per component, issued as u > v (operands swapped); writes .xy only
 }
 
-bvec3 lessThan (vec3 v, vec3 u) {
-    return bvec3 (v.x < u.x, v.y < u.y, v.z < u.z);
+bvec3 lessThan(const vec3 v, const vec3 u)
+{
+   __asm vec4_sgt __retVal.xyz, u, v;   // v < u per component, issued as u > v (operands swapped); writes .xyz only
 }
 
-bvec4 lessThan (vec4 v, vec4 u) {
-    return bvec4 (v.x < u.x, v.y < u.y, v.z < u.z, v.w < u.w);
+bvec4 lessThan(const vec4 v, const vec4 u)
+{
+   __asm vec4_sgt __retVal, u, v;   // v < u per component, issued as u > v (operands swapped)
 }
 
-bvec2 lessThan (ivec2 v, ivec2 u) {
-    return bvec2 (v.x < u.x, v.y < u.y);
+bvec2 lessThan(const ivec2 v, const ivec2 u)
+{
+   __asm vec4_sgt __retVal.xy, u, v;   // v < u per component (as u > v); integer vectors use the same vec4 instruction
 }
 
-bvec3 lessThan (ivec3 v, ivec3 u) {
-    return bvec3 (v.x < u.x, v.y < u.y, v.z < u.z);
+bvec3 lessThan(const ivec3 v, const ivec3 u)
+{
+   __asm vec4_sgt __retVal.xyz, u, v;   // v < u per component (as u > v); integer vectors use the same vec4 instruction
 }
 
-bvec4 lessThan (ivec4 v, ivec4 u) {
-    return bvec4 (v.x < u.x, v.y < u.y, v.z < u.z, v.w < u.w);
+bvec4 lessThan(const ivec4 v, const ivec4 u)
+{
+   __asm vec4_sgt __retVal, u, v;   // v < u per component (as u > v); integer vectors use the same vec4 instruction
 }
 
-bvec2 lessThanEqual (vec2 v, vec2 u) {
-    return bvec2 (v.x <= u.x, v.y <= u.y);
+
+//// lessThanEqual
+
+bvec2 lessThanEqual(const vec2 v, const vec2 u)
+{
+   __asm vec4_sge __retVal.xy, u, v;   // v <= u per component, issued as u >= v (operands swapped); writes .xy only
 }
 
-bvec3 lessThanEqual (vec3 v, vec3 u) {
-    return bvec3 (v.x <= u.x, v.y <= u.y, v.z <= u.z);
+bvec3 lessThanEqual(const vec3 v, const vec3 u)
+{
+   __asm vec4_sge __retVal.xyz, u, v;   // v <= u per component, issued as u >= v (operands swapped); writes .xyz only
 }
 
-bvec4 lessThanEqual (vec4 v, vec4 u) {
-    return bvec4 (v.x <= u.x, v.y <= u.y, v.z <= u.z, v.w <= u.w);
+bvec4 lessThanEqual(const vec4 v, const vec4 u)
+{
+   __asm vec4_sge __retVal, u, v;   // v <= u per component, issued as u >= v (operands swapped)
 }
 
-bvec2 lessThanEqual (ivec2 v, ivec2 u) {
-    return bvec2 (v.x <= u.x, v.y <= u.y);
+bvec2 lessThanEqual(const ivec2 v, const ivec2 u)
+{
+   __asm vec4_sge __retVal.xy, u, v;   // v <= u per component (as u >= v); integer vectors use the same vec4 instruction
 }
 
-bvec3 lessThanEqual (ivec3 v, ivec3 u) {
-    return bvec3 (v.x <= u.x, v.y <= u.y, v.z <= u.z);
+bvec3 lessThanEqual(const ivec3 v, const ivec3 u)
+{
+   __asm vec4_sge __retVal.xyz, u, v;   // v <= u per component (as u >= v); integer vectors use the same vec4 instruction
 }
 
-bvec4 lessThanEqual (ivec4 v, ivec4 u) {
-    return bvec4 (v.x <= u.x, v.y <= u.y, v.z <= u.z, v.w <= u.w);
+bvec4 lessThanEqual(const ivec4 v, const ivec4 u)
+{
+   __asm vec4_sge __retVal, u, v;   // v <= u per component (as u >= v); integer vectors use the same vec4 instruction
 }
 
-bvec2 greaterThan (vec2 v, vec2 u) {
-    return bvec2 (v.x > u.x, v.y > u.y);
+
+//// greaterThan
+
+bvec2 greaterThan(const vec2 v, const vec2 u)
+{
+   __asm vec4_sgt __retVal.xy, v, u;   // v > u per component; writes .xy only
 }
 
-bvec3 greaterThan (vec3 v, vec3 u) {
-    return bvec3 (v.x > u.x, v.y > u.y, v.z > u.z);
+bvec3 greaterThan(const vec3 v, const vec3 u)
+{
+   __asm vec4_sgt __retVal.xyz, v, u;   // v > u per component; writes .xyz only
 }
 
-bvec4 greaterThan (vec4 v, vec4 u) {
-    return bvec4 (v.x > u.x, v.y > u.y, v.z > u.z, v.w > u.w);
+bvec4 greaterThan(const vec4 v, const vec4 u)
+{
+   __asm vec4_sgt __retVal, v, u;   // v > u per component
 }
 
-bvec2 greaterThan (ivec2 v, ivec2 u) {
-    return bvec2 (v.x > u.x, v.y > u.y);
+bvec2 greaterThan(const ivec2 v, const ivec2 u)
+{
+   __asm vec4_sgt __retVal.xy, v, u;   // v > u per component; integer vectors use the same vec4 instruction
 }
 
-bvec3 greaterThan (ivec3 v, ivec3 u) {
-    return bvec3 (v.x > u.x, v.y > u.y, v.z > u.z);
+bvec3 greaterThan(const ivec3 v, const ivec3 u)
+{
+   __asm vec4_sgt __retVal.xyz, v, u;   // v > u per component; integer vectors use the same vec4 instruction
 }
 
-bvec4 greaterThan (ivec4 v, ivec4 u) {
-   return bvec4 (v.x > u.x, v.y > u.y, v.z > u.z, v.w > u.w);
+bvec4 greaterThan(const ivec4 v, const ivec4 u)
+{
+   __asm vec4_sgt __retVal, v, u;   // v > u per component; integer vectors use the same vec4 instruction
 }
 
-bvec2 greaterThanEqual (vec2 v, vec2 u) {
-    return bvec2 (v.x >= u.x, v.y >= u.y);
+
+//// greaterThanEqual
+
+bvec2 greaterThanEqual(const vec2 v, const vec2 u)
+{
+   __asm vec4_sge __retVal.xy, v, u;   // v >= u per component; writes .xy only
 }
 
-bvec3 greaterThanEqual (vec3 v, vec3 u) {
-    return bvec3 (v.x >= u.x, v.y >= u.y, v.z >= u.z);
+bvec3 greaterThanEqual(const vec3 v, const vec3 u)
+{
+   __asm vec4_sge __retVal.xyz, v, u;   // v >= u per component; writes .xyz only
 }
 
-bvec4 greaterThanEqual (vec4 v, vec4 u) {
-    return bvec4 (v.x >= u.x, v.y >= u.y, v.z >= u.z, v.w >= u.w);
+bvec4 greaterThanEqual(const vec4 v, const vec4 u)
+{
+   __asm vec4_sge __retVal, v, u;   // v >= u per component
 }
 
-bvec2 greaterThanEqual (ivec2 v, ivec2 u) {
-    return bvec2 (v.x >= u.x, v.y >= u.y);
+bvec2 greaterThanEqual(const ivec2 v, const ivec2 u)
+{
+   __asm vec4_sge __retVal.xy, v, u;   // v >= u per component; integer vectors use the same vec4 instruction
 }
 
-bvec3 greaterThanEqual (ivec3 v, ivec3 u) {
-    return bvec3 (v.x >= u.x, v.y >= u.y, v.z >= u.z);
+bvec3 greaterThanEqual(const ivec3 v, const ivec3 u)
+{
+   __asm vec4_sge __retVal.xyz, v, u;   // v >= u per component; integer vectors use the same vec4 instruction
 }
 
-bvec4 greaterThanEqual (ivec4 v, ivec4 u) {
-    return bvec4 (v.x >= u.x, v.y >= u.y, v.z >= u.z, v.w >= u.w);
+bvec4 greaterThanEqual(const ivec4 v, const ivec4 u)
+{
+   __asm vec4_sge __retVal, v, u;   // v >= u per component; integer vectors use the same vec4 instruction
 }
 
-bvec2 equal (vec2 v, vec2 u) {
-    return bvec2 (v.x == u.x, v.y == u.y);
+
+//// equal
+
+bvec2 equal(const vec2 v, const vec2 u)
+{
+   __asm vec4_seq __retVal.xy, v, u;   // v == u per component; writes .xy only
 }
 
-bvec3 equal (vec3 v, vec3 u) {
-    return bvec3 (v.x == u.x, v.y == u.y, v.z == u.z);
+bvec3 equal(const vec3 v, const vec3 u)
+{
+   __asm vec4_seq __retVal.xyz, v, u;   // v == u per component; writes .xyz only
 }
 
-bvec4 equal (vec4 v, vec4 u) {
-    return bvec4 (v.x == u.x, v.y == u.y, v.z == u.z, v.w == u.w);
+bvec4 equal(const vec4 v, const vec4 u)
+{
+   __asm vec4_seq __retVal, v, u;   // v == u per component
 }
 
-bvec2 equal (ivec2 v, ivec2 u) {
-    return bvec2 (v.x == u.x, v.y == u.y);
+bvec2 equal(const ivec2 v, const ivec2 u)
+{
+   __asm vec4_seq __retVal.xy, v, u;   // v == u per component; integer vectors use the same vec4 instruction
 }
 
-bvec3 equal (ivec3 v, ivec3 u) {
-    return bvec3 (v.x == u.x, v.y == u.y, v.z == u.z);
+bvec3 equal(const ivec3 v, const ivec3 u)
+{
+   __asm vec4_seq __retVal.xyz, v, u;   // v == u per component; integer vectors use the same vec4 instruction
 }
 
-bvec4 equal (ivec4 v, ivec4 u) {
-    return bvec4 (v.x == u.x, v.y == u.y, v.z == u.z, v.w == u.w);
+bvec4 equal(const ivec4 v, const ivec4 u)
+{
+   __asm vec4_seq __retVal, v, u;   // v == u per component; integer vectors use the same vec4 instruction
 }
 
-bvec2 notEqual (vec2 v, vec2 u) {
-    return bvec2 (v.x != u.x, v.y != u.y);
+
+//// notEqual
+
+bvec2 notEqual(const vec2 v, const vec2 u)
+{
+   __asm vec4_sne __retVal.xy, v, u;   // v != u per component; writes .xy only
 }
 
-bvec3 notEqual (vec3 v, vec3 u) {
-    return bvec3 (v.x != u.x, v.y != u.y, v.z != u.z);
+bvec3 notEqual(const vec3 v, const vec3 u)
+{
+   __asm vec4_sne __retVal.xyz, v, u;   // v != u per component; writes .xyz only
 }
 
-bvec4 notEqual (vec4 v, vec4 u) {
-    return bvec4 (v.x != u.x, v.y != u.y, v.z != u.z, v.w != u.w);
+bvec4 notEqual(const vec4 v, const vec4 u)
+{
+   __asm vec4_sne __retVal, v, u;   // v != u per component
 }
 
-bvec2 notEqual (ivec2 v, ivec2 u) {
-    return bvec2 (v.x != u.x, v.y != u.y);
+bvec2 notEqual(const ivec2 v, const ivec2 u)
+{
+   __asm vec4_sne __retVal.xy, v, u;   // v != u per component; integer vectors use the same vec4 instruction
 }
 
-bvec3 notEqual (ivec3 v, ivec3 u) {
-    return bvec3 (v.x != u.x, v.y != u.y, v.z != u.z);
+bvec3 notEqual(const ivec3 v, const ivec3 u)
+{
+   __asm vec4_sne __retVal.xyz, v, u;   // v != u per component; integer vectors use the same vec4 instruction
 }
 
-bvec4 notEqual (ivec4 v, ivec4 u) {
-    return bvec4 (v.x != u.x, v.y != u.y, v.z != u.z, v.w != u.w);
+bvec4 notEqual(const ivec4 v, const ivec4 u)
+{
+   __asm vec4_sne __retVal, v, u;   // v != u per component; integer vectors use the same vec4 instruction
 }
 
-bool any (bvec2 v) {
-    return v.x || v.y;
+
+//// any
+
+bool any(const bvec2 v)
+{
+   float sum;   // scratch accumulator; only .x is used
+   __asm vec4_add sum.x, v.x, v.y;   // add the components together
+   __asm vec4_sne __retVal.x, sum.x, 0.0;   // result is (sum != 0); NOTE(review): assumes true/false are stored as 1.0/0.0 - confirm
 }
 
-bool any (bvec3 v) {
-    return v.x || v.y || v.z;
+bool any(const bvec3 v)
+{
+   float sum;   // scratch accumulator; only .x is used
+   __asm vec4_add sum.x, v.x, v.y;   // x + y
+   __asm vec4_add sum.x, sum.x, v.z;   // ... + z
+   __asm vec4_sne __retVal.x, sum.x, 0.0;   // result is (sum != 0); NOTE(review): assumes true/false are stored as 1.0/0.0 - confirm
 }
 
-bool any (bvec4 v) {
-    return v.x || v.y || v.z || v.w;
+bool any(const bvec4 v)
+{
+   float sum;   // scratch accumulator; only .x is used
+   __asm vec4_add sum.x, v.x, v.y;   // x + y
+   __asm vec4_add sum.x, sum.x, v.z;   // ... + z
+   __asm vec4_add sum.x, sum.x, v.w;   // ... + w
+   __asm vec4_sne __retVal.x, sum.x, 0.0;   // result is (sum != 0); NOTE(review): assumes true/false are stored as 1.0/0.0 - confirm
 }
 
-bool all (bvec2 v) {
+
+//// all
+
+bool all (const bvec2 v)
+{
+   float prod;   // scratch accumulator; only .x is used
+   __asm vec4_multiply prod.x, v.x, v.y;   // product is nonzero only when both components are nonzero
+   __asm vec4_sne __retVal.x, prod.x, 0.0;   // result is (prod != 0); the pre-existing return that follows yields v.x && v.y
     return v.x && v.y;
 }
 
-bool all (bvec3 v) {
-    return v.x && v.y && v.z;
+bool all (const bvec3 v)
+{
+   float prod;   // scratch accumulator; only .x is used
+   __asm vec4_multiply prod.x, v.x, v.y;   // x * y
+   __asm vec4_multiply prod.x, prod.x, v.z;   // ... * z
+   __asm vec4_sne __retVal.x, prod.x, 0.0;   // result is (prod != 0): any zero component forces the product to zero
 }
 
-bool all (bvec4 v) {
-    return v.x && v.y && v.z && v.w;
+bool all (const bvec4 v)
+{
+   float prod;   // scratch accumulator; only .x is used
+   __asm vec4_multiply prod.x, v.x, v.y;   // x * y
+   __asm vec4_multiply prod.x, prod.x, v.z;   // ... * z
+   __asm vec4_multiply prod.x, prod.x, v.w;   // ... * w
+   __asm vec4_sne __retVal.x, prod.x, 0.0;   // result is (prod != 0): any zero component forces the product to zero
 }
 
-bvec2 not (bvec2 v) {
-    return bvec2 (!v.x, !v.y);
+
+
+//// not
+
+bvec2 not (const bvec2 v)
+{
+   __asm vec4_seq __retVal.xy, v, 0.0;   // (v == 0.0) per component, i.e. true where v is false; writes .xy only
 }
 
-bvec3 not (bvec3 v) {
-    return bvec3 (!v.x, !v.y, !v.z);
+bvec3 not (const bvec3 v)
+{
+   __asm vec4_seq __retVal.xyz, v, 0.0;   // (v == 0.0) per component, i.e. true where v is false; writes .xyz only
 }
 
-bvec4 not (bvec4 v) {
-    return bvec4 (!v.x, !v.y, !v.z, !v.w);
+bvec4 not (const bvec4 v)
+{
+   __asm vec4_seq __retVal, v, 0.0;   // (v == 0.0) per component, i.e. true where v is false
 }
 
+
+
 //
 // 8.7 Texture Lookup Functions
 //
diff --git a/src/mesa/shader/slang/library/slang_core.gc b/src/mesa/shader/slang/library/slang_core.gc
index 7a721a5a1d0..8f1b0c2d3c3 100755
--- a/src/mesa/shader/slang/library/slang_core.gc
+++ b/src/mesa/shader/slang/library/slang_core.gc
@@ -23,69 +23,93 @@
  */
 
 //
-// This file defines nearly all constructors and operators for built-in data types, using
-// extended language syntax. In general, compiler treats constructors and operators as
-// ordinary functions with some exceptions. For example, the language does not allow
-// functions to be called in constant expressions - here the exception is made to allow it.
+// This file defines nearly all constructors and operators for built-in data
+// types, using extended language syntax. In general, compiler treats
+// constructors and operators as ordinary functions with some exceptions.
+// For example, the language does not allow functions to be called in
+// constant expressions - here the exception is made to allow it.
 //
-// Each implementation provides its own version of this file. Each implementation can define
-// the required set of operators and constructors in its own fashion.
+// Each implementation provides its own version of this file. Each
+// implementation can define the required set of operators and constructors
+// in its own fashion.
 //
-// The extended language syntax is only present when compiling this file. It is implicitly
-// included at the very beginning of the compiled shader, so no built-in functions can be
-// used.
+// The extended language syntax is only present when compiling this file.
+// It is implicitly included at the very beginning of the compiled shader,
+// so no built-in functions can be used.
 //
-// To communicate with the implementation, a special extended "__asm" keyword is used, followed
-// by an instruction name (any valid identifier), a destination variable identifier and a
-// a list of zero or more source variable identifiers. A variable identifier is a variable name
-// declared earlier in the code (as a function parameter, local or global variable).
-// An instruction name designates an instruction that must be exported by the implementation.
-// Each instruction receives data from source variable identifiers and returns data in the
-// destination variable identifier.
+// To communicate with the implementation, a special extended "__asm" keyword
+// is used, followed by an instruction name (any valid identifier), a
+// destination variable identifier and a list of zero or more source
+// variable identifiers.
 //
-// It is up to the implementation how to define a particular operator or constructor. If it is
-// expected to being used rarely, it can be defined in terms of other operators and constructors,
+// A variable identifier is a variable name declared earlier in the code
+// (as a function parameter, local or global variable).
+//
+// An instruction name designates an instruction that must be exported
+// by the implementation.  Each instruction receives data from source
+// variable identifiers and returns data in the destination variable
+// identifier.
+//
+// It is up to the implementation how to define a particular operator
+// or constructor. If it is expected to be used rarely, it can be
+// defined in terms of other operators and constructors,
 // for example:
 //
 // ivec2 __operator + (const ivec2 x, const ivec2 y) {
 //    return ivec2 (x[0] + y[0], x[1] + y[1]);
 // }
 //
-// If a particular operator or constructor is expected to be used very often or is an atomic
-// operation (that is, an operation that cannot be expressed in terms of other operations or
-// would create a dependency cycle) it must be defined using one or more __asm constructs.
+// If a particular operator or constructor is expected to be used very
+// often or is an atomic operation (that is, an operation that cannot be
+// expressed in terms of other operations or would create a dependency
+// cycle) it must be defined using one or more __asm constructs.
 //
-// Each implementation must define constructors for all scalar types (bool, float, int).
-// There are 9 scalar-to-scalar constructors (including identity constructors). However,
-// since the language introduces special constructors (like matrix constructor with a single
+// Each implementation must define constructors for all scalar types
+// (bool, float, int).  There are 9 scalar-to-scalar constructors
+// (including identity constructors). However, since the language
+// introduces special constructors (like matrix constructor with a single
 // scalar value), implementations must also implement these cases.
 // The compiler provides the following algorithm when resolving a constructor:
 // - try to find a constructor with a prototype matching ours,
-// - if no constructor is found and this is a scalar-to-scalar constructor, raise an error,
+// - if no constructor is found and this is a scalar-to-scalar constructor,
+//   raise an error,
 // - if a constructor is found, execute it and return,
-// - count the size of the constructor parameter list - if it is less than the size of
-//   our constructor's type, raise an error,
-// - for each parameter in the list do a recursive constructor matching for appropriate
-//   scalar fields in the constructed variable,
+// - count the size of the constructor parameter list - if it is less than
+//   the size of our constructor's type, raise an error,
+// - for each parameter in the list do a recursive constructor matching for
+//   appropriate scalar fields in the constructed variable,
 //
-// Each implementation must also define a set of operators that deal with built-in data types.
+// Each implementation must also define a set of operators that deal with
+// built-in data types.
 // There are four kinds of operators:
-// 1) Operators that are implemented only by the compiler: "()" (function call), "," (sequence)
-//    and "?:" (selection).
-// 2) Operators that are implemented by the compiler by expressing it in terms of other operators:
+// 1) Operators that are implemented only by the compiler: "()" (function
+//    call), "," (sequence) and "?:" (selection).
+// 2) Operators that are implemented by the compiler by expressing them in
+//    terms of other operators:
 //    - "." (field selection) - translated to subscript access,
-//    - "&&" (logical and) - translated to "<left_expr> ? <right_expr> : false",
+//    - "&&" (logical and) - translated to "<left_expr> ? <right_expr> :
+//      false",
 //    - "||" (logical or) - translated to "<left_expr> ? true : <right_expr>",
-// 3) Operators that can be defined by the implementation and if the required prototype is not
-//    found, standard behaviour is used:
-//    - "==", "!=", "=" (equality, assignment) - compare or assign matching fields one-by-one;
-//      note that at least operators for scalar data types must be defined by the implementation
-//      to get it work,
-// 4) All other operators not mentioned above. If no required prototype is found, an error is
-//    raised. An implementation must follow the language specification to provide all valid
-//    operator prototypes.
+// 3) Operators that can be defined by the implementation and if the required
+//    prototype is not found, standard behaviour is used:
+//    - "==", "!=", "=" (equality, assignment) - compare or assign
+//      matching fields one-by-one;
+//      note that at least operators for scalar data types must be defined
+//      by the implementation to get it to work,
+// 4) All other operators not mentioned above. If no required prototype is
+//    found, an error is raised. An implementation must follow the language
+//    specification to provide all valid operator prototypes.
 //
 
+//bp: builds a vec4 by storing each scalar argument into one component of __retVal
+vec4 vec4(const float a1, const float b1, const float c1, const float d1)
+{
+   __retVal.x = a1;   // first argument -> x
+   __retVal.y = b1;   // second argument -> y
+   __retVal.z = c1;   // third argument -> z
+   __retVal.w = d1;   // fourth argument -> w
+}
+
 int __constructor (const float f) {
     int i;
     __asm float_to_int i, f;
@@ -154,6 +178,7 @@ vec3 __constructor (const bool b) {
     return vec3 (b ? 1.0 : 0.0);
 }
 
+//bp: TODO replace with asm, i.e. a single move of f.xxxx (f replicated to all four components)
 vec4 __constructor (const float f) {
     return vec4 (f, f, f, f);
 }
@@ -307,9 +332,11 @@ void __operator /= (inout float a, const float b) {
 }
 
 float __operator + (const float a, const float b) {
-    float c;
-    __asm float_add c, a, b;
-    return c;
+//    float c;
+//    __asm float_add c, a, b;
+//    return c;
+//bp: a + b is written straight into __retVal instead of going through the temporary c
+   __asm float_add __retVal, a, b;
 }
 
 void __operator += (inout int a, const int b) {
@@ -330,9 +357,11 @@ void __operator -= (inout int a, const int b) {
 }
 
 float __operator * (const float a, const float b) {
-    float c;
-    __asm float_multiply c, a, b;
-    return c;
+//    float c;
+//    __asm float_multiply c, a, b;
+//    return c;
+//bp: a * b is written straight into __retVal instead of going through the temporary c
+    __asm float_multiply __retVal, a, b;
 }
 
 void __operator *= (inout int a, const int b) {
@@ -340,9 +369,11 @@ void __operator *= (inout int a, const int b) {
 }
 
 float __operator / (const float a, const float b) {
-    float c;
-    __asm float_divide c, a, b;
-    return c;
+//    float c;
+//    __asm float_divide c, a, b;
+//    return c;
+//bp: a / b is written straight into __retVal instead of going through the temporary c
+   __asm float_divide __retVal, a, b;
 }
 
 void __operator /= (inout int a, const int b) {
@@ -535,12 +566,22 @@ void __operator -= (inout mat3 m, const mat3 n) {
     m[2] -= n[2];
 }
 
-vec3 __operator * (const mat3 m, const vec3 v) {
-    return vec3 (
-        v.x * m[0].x + v.y * m[1].x + v.z * m[2].x,
-        v.x * m[0].y + v.y * m[1].y + v.z * m[2].y,
-        v.x * m[0].z + v.y * m[1].z + v.z * m[2].z
-    );
+//bp: matrix * column vector, computed as one dot product per result component
+vec3 __operator * (const mat3 m, const vec3 v)
+{
+   vec3 r1, r2, r3;   // r1/r2/r3 collect the .x/.y/.z entries of m[0], m[1], m[2]
+   r1.x = m[0].x;
+   r1.y = m[1].x;
+   r1.z = m[2].x;
+   r2.x = m[0].y;
+   r2.y = m[1].y;
+   r2.z = m[2].y;
+   r3.x = m[0].z;
+   r3.y = m[1].z;
+   r3.z = m[2].z;
+   __asm vec3_dot __retVal.x, r1, v;   // result.x = v.x*m[0].x + v.y*m[1].x + v.z*m[2].x
+   __asm vec3_dot __retVal.y, r2, v;   // result.y: same with the .y entries
+   __asm vec3_dot __retVal.z, r3, v;   // result.z: same with the .z entries
 }
 
 mat3 __operator * (const mat3 m, const mat3 n) {
@@ -571,13 +612,57 @@ void __operator -= (inout mat4 m, const mat4 n) {
     m[3] -= n[3];
 }
 
-vec4 __operator * (const mat4 m, const vec4 v) {
-    return vec4 (
-        v.x * m[0].x + v.y * m[1].x + v.z * m[2].x + v.w * m[3].x,
-        v.x * m[0].y + v.y * m[1].y + v.z * m[2].y + v.w * m[3].y,
-        v.x * m[0].z + v.y * m[1].z + v.z * m[2].z + v.w * m[3].z,
-        v.x * m[0].w + v.y * m[1].w + v.z * m[2].w + v.w * m[3].w
-    );
+
+
+
+//// dot  (formerly in slang_common_builtin.gc)
+
+float dot(const float a, const float b)
+{
+   return a * b;   // scalar case: the dot product is the plain product
+}
+
+float dot(const vec2 a, const vec2 b)
+{
+   return a.x * b.x + a.y * b.y;   // sum of the x and y products, written out without __asm
+}
+
+float dot(const vec3 a, const vec3 b)
+{
+    __asm vec3_dot __retVal, a, b;   // 3-component dot product, stored directly in the return value
+}
+
+float dot(const vec4 a, const vec4 b)
+{
+    __asm vec4_dot __retVal, a, b;   // 4-component dot product, stored directly in the return value
+}
+
+
+
+
+vec4 __operator * (const mat4 m, const vec4 v)
+{
+   vec4 r1, r2, r3, r4;   // r1..r4 collect the .x/.y/.z/.w entries of m[0]..m[3]
+   r1.x = m[0].x;
+   r1.y = m[1].x;
+   r1.z = m[2].x;
+   r1.w = m[3].x;
+   r2.x = m[0].y;
+   r2.y = m[1].y;
+   r2.z = m[2].y;
+   r2.w = m[3].y;
+   r3.x = m[0].z;
+   r3.y = m[1].z;
+   r3.z = m[2].z;
+   r3.w = m[3].z;
+   r4.x = m[0].w;
+   r4.y = m[1].w;
+   r4.z = m[2].w;
+   r4.w = m[3].w;
+   __asm vec4_dot __retVal.x, r1, v;   // result.x = v.x*m[0].x + v.y*m[1].x + v.z*m[2].x + v.w*m[3].x
+   __asm vec4_dot __retVal.y, r2, v;   // result.y: same with the .y entries
+   __asm vec4_dot __retVal.z, r3, v;   // result.z: same with the .z entries
+   __asm vec4_dot __retVal.w, r4, v;   // result.w: same with the .w entries
 }
 
 mat4 __operator * (const mat4 m, const mat4 n) {
@@ -768,6 +853,11 @@ vec4 __operator * (const vec4 v, const mat4 m) {
         v.x * m[1].x + v.y * m[1].y + v.z * m[1].z + v.w * m[1].w,
         v.x * m[2].x + v.y * m[2].y + v.z * m[2].z + v.w * m[2].w,
         v.x * m[3].x + v.y * m[3].y + v.z * m[3].z + v.w * m[3].w
+//bp:
+//          dot(v, m[0]),
+//          dot(v, m[1]),
+//          dot(v, m[2]),
+//          dot(v, m[3])
     );
 }
 
@@ -776,10 +866,12 @@ void __operator *= (inout vec4 v, const mat4 m) {
 }
 
 float __operator - (const float a, const float b) {
-    float c;
-    __asm float_negate c, b;
-    __asm float_add    c, a, c;
-    return c;
+//    float c;
+//    __asm float_negate c, b;
+//    __asm float_add    c, a, c;
+//    return c;
+//bp: a single float_subtract into __retVal replaces the old negate-then-add sequence
+   __asm float_subtract __retVal, a, b;
 }
 
 int __operator + (const int a, const int b) {
@@ -855,8 +947,10 @@ vec3 __operator / (const vec3 v, const vec3 u) {
     return vec3 (v.x / u.x, v.y / u.y, v.z / u.z);
 }
 
-vec4 __operator + (const vec4 v, const vec4 u) {
-    return vec4 (v.x + u.x, v.y + u.y, v.z + u.z, v.w + u.w);
+vec4 __operator + (const vec4 vadd, const vec4 uadd) {
+//    return vec4 (v.x + u.x, v.y + u.y, v.z + u.z, v.w + u.w);
+//bp: one vec4_add into __retVal replaces the four per-component adds above
+   __asm vec4_add __retVal, vadd, uadd;
 }
 
 vec4 __operator - (const vec4 v, const vec4 u) {
@@ -864,7 +958,10 @@ vec4 __operator - (const vec4 v, const vec4 u) {
 }
 
 vec4 __operator * (const vec4 v, const vec4 u) {
-    return vec4 (v.x * u.x, v.y * u.y, v.z * u.z, v.w * u.w);
+//    return vec4 (v.x * u.x, v.y * u.y, v.z * u.z, v.w * u.w);
+//    return v;
+//bp: one vec4_multiply into __retVal replaces the four per-component products above
+    __asm vec4_multiply __retVal, v, u;
 }
 
 vec4 __operator / (const vec4 v, const vec4 u) {
@@ -1007,8 +1104,10 @@ vec3 __operator * (const float a, const vec3 u) {
     return vec3 (a * u.x, a * u.y, a * u.z);
 }
 
-vec3 __operator * (const vec3 v, const float b) {
-    return vec3 (v.x * b, v.y * b, v.z * b);
+//bp: scales v by the scalar b
+vec3 __operator * (const vec3 v, const float b)
+{
+   __retVal.xyz = v.xyz * b.xxx;   // b.xxx replicates b to three components; presumably resolves to the vec3 * vec3 operator - confirm
 }
 
 vec3 __operator / (const float a, const vec3 u) {
@@ -1039,8 +1138,10 @@ vec4 __operator * (const float a, const vec4 u) {
     return vec4 (a * u.x, a * u.y, a * u.z, a * u.w);
 }
 
-vec4 __operator * (const vec4 v, const float b) {
-    return vec4 (v.x * b, v.y * b, v.z * b, v.w * b);
+//bp: scales v by the scalar b with a single vec4_multiply
+vec4 __operator * (const vec4 v, const float b)
+{
+    __asm vec4_multiply __retVal.xyzw, v.xyzw, b.xxxx;   // b.xxxx replicates b across all four components
 }
 
 vec4 __operator / (const float a, const vec4 u) {