diff options
author | Ben Skeggs <[email protected]> | 2014-05-09 15:55:47 +1000 |
---|---|---|
committer | Ben Skeggs <[email protected]> | 2014-05-15 09:54:12 +1000 |
commit | 0079a375a58b288caacc2721f5a34b8f1233e7d1 (patch) | |
tree | 4d7b244b3cb826e9cbccf090fe549fea51351736 /src/gallium/drivers/nouveau/codegen/lib | |
parent | 737477dac33d68b00b34019258d663945fbfeb56 (diff) |
nvc0: allow for easier modification of compiler library routines
Signed-off-by: Ben Skeggs <[email protected]>
Reviewed-by: Ilia Mirkin <[email protected]>
Diffstat (limited to 'src/gallium/drivers/nouveau/codegen/lib')
-rw-r--r-- | src/gallium/drivers/nouveau/codegen/lib/Makefile | 10 | ||||
-rw-r--r-- | src/gallium/drivers/nouveau/codegen/lib/gf100.asm | 107 | ||||
-rw-r--r-- | src/gallium/drivers/nouveau/codegen/lib/gf100.asm.h | 63 | ||||
-rw-r--r-- | src/gallium/drivers/nouveau/codegen/lib/gk104.asm | 711 | ||||
-rw-r--r-- | src/gallium/drivers/nouveau/codegen/lib/gk104.asm.h | 598 | ||||
-rw-r--r-- | src/gallium/drivers/nouveau/codegen/lib/gk110.asm | 98 | ||||
-rw-r--r-- | src/gallium/drivers/nouveau/codegen/lib/gk110.asm.h | 81 |
7 files changed, 1668 insertions, 0 deletions
diff --git a/src/gallium/drivers/nouveau/codegen/lib/Makefile b/src/gallium/drivers/nouveau/codegen/lib/Makefile new file mode 100644 index 00000000000..28a41a3f41e --- /dev/null +++ b/src/gallium/drivers/nouveau/codegen/lib/Makefile @@ -0,0 +1,10 @@ +ENVYAS ?= envyas + +all: gf100.asm.h gk104.asm.h gk110.asm.h + +gf100.asm.h: %.asm.h: %.asm + $(ENVYAS) -a -W -mnvc0 -Vnvc0 $< -o $@ +gk104.asm.h: %.asm.h: %.asm + $(ENVYAS) -a -W -mnvc0 -Vnve4 $< -o $@ +gk110.asm.h: %.asm.h: %.asm + $(ENVYAS) -a -W -mgk110 $< -o $@ diff --git a/src/gallium/drivers/nouveau/codegen/lib/gf100.asm b/src/gallium/drivers/nouveau/codegen/lib/gf100.asm new file mode 100644 index 00000000000..cf393b1bf56 --- /dev/null +++ b/src/gallium/drivers/nouveau/codegen/lib/gf100.asm @@ -0,0 +1,107 @@ +.section #gf100_builtin_code +// DIV U32 +// +// UNR recurrence (q = a / b): +// look for z such that 2^32 - b <= b * z < 2^32 +// then q - 1 <= (a * z) / 2^32 <= q +// +// INPUT: $r0: dividend, $r1: divisor +// OUTPUT: $r0: result, $r1: modulus +// CLOBBER: $r2 - $r3, $p0 - $p1 +// SIZE: 22 / 14 * 8 bytes +// +gf100_div_u32: + bfind u32 $r2 $r1 + xor b32 $r2 $r2 0x1f + mov b32 $r3 0x1 + shl b32 $r2 $r3 clamp $r2 + cvt u32 $r1 neg u32 $r1 + mul $r3 u32 $r1 u32 $r2 + add $r2 (mul high u32 $r2 u32 $r3) $r2 + mul $r3 u32 $r1 u32 $r2 + add $r2 (mul high u32 $r2 u32 $r3) $r2 + mul $r3 u32 $r1 u32 $r2 + add $r2 (mul high u32 $r2 u32 $r3) $r2 + mul $r3 u32 $r1 u32 $r2 + add $r2 (mul high u32 $r2 u32 $r3) $r2 + mul $r3 u32 $r1 u32 $r2 + add $r2 (mul high u32 $r2 u32 $r3) $r2 + mov b32 $r3 $r0 + mul high $r0 u32 $r0 u32 $r2 + cvt u32 $r2 neg u32 $r1 + add $r1 (mul u32 $r1 u32 $r0) $r3 + set $p0 0x1 ge u32 $r1 $r2 + $p0 sub b32 $r1 $r1 $r2 + $p0 add b32 $r0 $r0 0x1 + $p0 set $p0 0x1 ge u32 $r1 $r2 + $p0 sub b32 $r1 $r1 $r2 + $p0 add b32 $r0 $r0 0x1 + ret + +// DIV S32, like DIV U32 after taking ABS(inputs) +// +// INPUT: $r0: dividend, $r1: divisor +// OUTPUT: $r0: result, $r1: modulus +// CLOBBER: $r2 - $r3, $p0 - $p3 +// +gf100_div_s32: + set $p2 0x1 lt s32 $r0 0x0 + set $p3 0x1 lt s32 $r1 0x0 xor $p2 + cvt s32 $r0 abs s32 $r0 + cvt s32 $r1 abs s32 $r1 + bfind u32 $r2 $r1 + xor b32 $r2 $r2 0x1f + mov b32 $r3 0x1 + shl b32 $r2 $r3 clamp $r2 + cvt u32 $r1 neg u32 $r1 + mul $r3 u32 $r1 u32 $r2 + add $r2 (mul high u32 $r2 u32 $r3) $r2 + mul $r3 u32 $r1 u32 $r2 + add $r2 (mul high u32 $r2 u32 $r3) $r2 + mul $r3 u32 $r1 u32 $r2 + add $r2 (mul high u32 $r2 u32 $r3) $r2 + mul $r3 u32 $r1 u32 $r2 + add $r2 (mul high u32 $r2 u32 $r3) $r2 + mul $r3 u32 $r1 u32 $r2 + add $r2 (mul high u32 $r2 u32 $r3) $r2 + mov b32 $r3 $r0 + mul high $r0 u32 $r0 u32 $r2 + cvt u32 $r2 neg u32 $r1 + add $r1 (mul u32 $r1 u32 $r0) $r3 + set $p0 0x1 ge u32 $r1 $r2 + $p0 sub b32 $r1 $r1 $r2 + $p0 add b32 $r0 $r0 0x1 + $p0 set $p0 0x1 ge u32 $r1 $r2 + $p0 sub b32 $r1 $r1 $r2 + $p0 add b32 $r0 $r0 0x1 + $p3 cvt s32 $r0 neg s32 $r0 + $p2 cvt s32 $r1 neg s32 $r1 + ret + +// RCP F64: Newton Raphson reciprocal(x): r_{i+1} = r_i * (2.0 - x * r_i) +// +// INPUT: $r0d (x) +// OUTPUT: $r0d (rcp(x)) +// CLOBBER: $r2 - $r7 +// SIZE: 9 * 8 bytes +// +gf100_rcp_f64: + nop + ret + +// RSQ F64: Newton Raphson rsqrt(x): r_{i+1} = r_i * (1.5 - 0.5 * x * r_i * r_i) +// +// INPUT: $r0d (x) +// OUTPUT: $r0d (rsqrt(x)) +// CLOBBER: $r2 - $r7 +// SIZE: 14 * 8 bytes +// +gf100_rsq_f64: + nop + ret + +.section #gf100_builtin_offsets +.b64 #gf100_div_u32 +.b64 #gf100_div_s32 +.b64 #gf100_rcp_f64 +.b64 #gf100_rsq_f64 diff --git a/src/gallium/drivers/nouveau/codegen/lib/gf100.asm.h b/src/gallium/drivers/nouveau/codegen/lib/gf100.asm.h new file mode 100644 index 00000000000..00fe5eab1c2 --- /dev/null +++ b/src/gallium/drivers/nouveau/codegen/lib/gf100.asm.h @@ -0,0 +1,63 @@ +uint64_t gf100_builtin_code[] = { +/* 0x0000: gf100_div_u32 */ + 0x7800000004009c03, + 0x0010dd187c209cdd, + 0x6000000008309c03, + 0x0810dc2a05605c18, + 0x200400000c209c43, + 0x500000000810dc03, + 0x200400000c209c43, + 0x500000000810dc03, + 0x200400000c209c43, + 0x500000000810dc03, + 0x200400000c209c43, + 0x500000000810dc03, + 0x200400000c209c43, + 0x280000000000dde4, + 0x5000000008001c43, + 0x0010430d05609c18, + 0x1b0e00000811dc03, + 0x4800000008104103, + 0x0800000004000002, + 0x1b0e00000811c003, + 0x4800000008104103, + 0x90001dff040000ac, +/* 0x00b0: gf100_div_s32 */ + 0x188e0000fc05dc23, + 0x18c40000fc17dc23, + 0x07305e1803301e18, + 0x7800000004009c03, + 0x0010dd187c209cdd, + 0x6000000008309c03, + 0x0810dc2a05605c18, + 0x200400000c209c43, + 0x500000000810dc03, + 0x200400000c209c43, + 0x500000000810dc03, + 0x200400000c209c43, + 0x500000000810dc03, + 0x200400000c209c43, + 0x500000000810dc03, + 0x200400000c209c43, + 0x280000000000dde4, + 0x5000000008001c43, + 0x0010430d05609c18, + 0x1b0e00000811dc03, + 0x4800000008104103, + 0x0800000004000002, + 0x1b0e00000811c003, + 0x4800000008104103, + 0x01700e18040000ac, + 0x90001dff05704a18, +/* 0x0180: gf100_rcp_f64 */ + 0x90001dff00001c08, +/* 0x0188: gf100_rsq_f64 */ + 0x90001dff00001c08, +}; + +uint64_t gf100_builtin_offsets[] = { + 0x0000000000000000, + 0x00000000000000b0, + 0x0000000000000180, + 0x0000000000000188, +}; diff --git a/src/gallium/drivers/nouveau/codegen/lib/gk104.asm b/src/gallium/drivers/nouveau/codegen/lib/gk104.asm new file mode 100644 index 00000000000..cd65b547279 --- /dev/null +++ b/src/gallium/drivers/nouveau/codegen/lib/gk104.asm @@ -0,0 +1,711 @@ +.section #gk104_builtin_code +// DIV U32 +// +// UNR recurrence (q = a / b): +// look for z such that 2^32 - b <= b * z < 2^32 +// then q - 1 <= (a * z) / 2^32 <= q +// +// INPUT: $r0: dividend, $r1: divisor +// OUTPUT: $r0: result, $r1: modulus +// CLOBBER: $r2 - $r3, $p0 - $p1 +// SIZE: 22 / 14 * 8 bytes +// +gk104_div_u32: + sched 0x28 0x4 0x28 0x4 0x28 0x28 0x28 + bfind u32 $r2 $r1 + long xor b32 $r2 $r2 0x1f + long mov b32 $r3 0x1 + shl b32 $r2 $r3 clamp $r2 + long cvt u32 $r1 neg u32 $r1 + long mul $r3 u32 $r1 u32 $r2 + add $r2 (mul high u32 $r2 u32 $r3) $r2 + sched 0x28 0x28 0x28 0x28 0x28 0x28 0x28 + mul $r3 u32 $r1 u32 $r2 + add $r2 (mul high u32 $r2 u32 $r3) $r2 + mul $r3 u32 $r1 u32 $r2 + add $r2 (mul high u32 $r2 u32 $r3) $r2 + mul $r3 u32 $r1 u32 $r2 + add $r2 (mul high u32 $r2 u32 $r3) $r2 + mul $r3 u32 $r1 u32 $r2 + sched 0x4 0x28 0x4 0x28 0x28 0x2c 0x4 + add $r2 (mul high u32 $r2 u32 $r3) $r2 + mov b32 $r3 $r0 + mul high $r0 u32 $r0 u32 $r2 + long cvt u32 $r2 neg u32 $r1 + long add $r1 (mul u32 $r1 u32 $r0) $r3 + set $p0 0x1 ge u32 $r1 $r2 + $p0 sub b32 $r1 $r1 $r2 + sched 0x28 0x2c 0x4 0x20 0x2e 0x28 0x20 + $p0 add b32 $r0 $r0 0x1 + $p0 set $p0 0x1 ge u32 $r1 $r2 + $p0 sub b32 $r1 $r1 $r2 + $p0 add b32 $r0 $r0 0x1 + long ret + +// DIV S32, like DIV U32 after taking ABS(inputs) +// +// INPUT: $r0: dividend, $r1: divisor +// OUTPUT: $r0: result, $r1: modulus +// CLOBBER: $r2 - $r3, $p0 - $p3 +// +gk104_div_s32: + set $p2 0x1 lt s32 $r0 0x0 + set $p3 0x1 lt s32 $r1 0x0 xor $p2 + sched 0x20 0x28 0x28 0x4 0x28 0x04 0x28 + long cvt s32 $r0 abs s32 $r0 + long cvt s32 $r1 abs s32 $r1 + bfind u32 $r2 $r1 + long xor b32 $r2 $r2 0x1f + long mov b32 $r3 0x1 + shl b32 $r2 $r3 clamp $r2 + cvt u32 $r1 neg u32 $r1 + sched 0x28 0x28 0x28 0x28 0x28 0x28 0x28 + mul $r3 u32 $r1 u32 $r2 + add $r2 (mul high u32 $r2 u32 $r3) $r2 + mul $r3 u32 $r1 u32 $r2 + add $r2 (mul high u32 $r2 u32 $r3) $r2 + mul $r3 u32 $r1 u32 $r2 + add $r2 (mul high u32 $r2 u32 $r3) $r2 + mul $r3 u32 $r1 u32 $r2 + sched 0x28 0x28 0x4 0x28 0x04 0x28 0x28 + add $r2 (mul high u32 $r2 u32 $r3) $r2 + mul $r3 u32 $r1 u32 $r2 + add $r2 (mul high u32 $r2 u32 $r3) $r2 + mov b32 $r3 $r0 + mul high $r0 u32 $r0 u32 $r2 + long cvt u32 $r2 neg u32 $r1 + long add $r1 (mul u32 $r1 u32 $r0) $r3 + sched 0x2c 0x04 0x28 0x2c 0x04 0x28 0x20 + set $p0 0x1 ge u32 $r1 $r2 + $p0 sub b32 $r1 $r1 $r2 + $p0 add b32 $r0 $r0 0x1 + $p0 set $p0 0x1 ge u32 $r1 $r2 + $p0 sub b32 $r1 $r1 $r2 + long $p0 add b32 $r0 $r0 0x1 + long $p3 cvt s32 $r0 neg s32 $r0 + sched 0x04 0x2e 0x04 0x28 0x04 0x20 0x2c + $p2 cvt s32 $r1 neg s32 $r1 + long ret + +// SULDP [for each format] +// $r4d: address +// $r2: surface info (format) +// $p0: access predicate +// $p1, $p2: caching predicate (00: cv, 01: ca, 10: cg) +// +// RGBA32 +$p1 suldgb b128 $r0q ca zero u8 g[$r4d] $r2 $p0 +set $p1 0x1 $p1 xor not $p2 +$p2 suldgb b128 $r0q cg zero u8 g[$r4d] $r2 $p0 +$p1 suldgb b128 $r0q cv zero u8 g[$r4d] $r2 $p0 +long ret +// RGBA16_UNORM +sched 0x00 0x00 0x00 0x00 0x00 0x00 0x00 +$p1 suldgb b128 $r0q ca zero u8 g[$r4d] $r2 $p0 +set $p1 0x1 $p1 xor not $p2 +$p2 suldgb b128 $r0q cg zero u8 g[$r4d] $r2 $p0 +$p1 suldgb b128 $r0q cv zero u8 g[$r4d] $r2 $p0 +cvt rn f32 $r3 u16 1 $r1 +cvt rn f32 $r2 u16 0 $r1 +mul f32 $r3 $r3 0x37800074 +sched 0x00 0x00 0x00 0x00 0x00 0x00 0x00 +cvt rn f32 $r1 u16 1 $r0 +mul f32 $r2 $r2 0x37800074 +cvt rn f32 $r0 u16 0 $r0 +mul f32 $r1 $r1 0x37800074 +mul f32 $r0 $r0 0x37800074 +long ret +// RGBA16_SNORM +$p1 suldgb b64 $r0d ca zero u8 g[$r4d] $r2 $p0 +sched 0x00 0x00 0x00 0x00 0x00 0x00 0x00 +set $p1 0x1 $p1 xor not $p2 +$p2 suldgb b64 $r0d cg zero u8 g[$r4d] $r2 $p0 +$p1 suldgb b64 $r0d cv zero u8 g[$r4d] $r2 $p0 +cvt rn f32 $r3 s16 1 $r1 +cvt rn f32 $r2 s16 0 $r1 +mul f32 $r3 $r3 0x38000187 +cvt rn f32 $r1 s16 1 $r0 +sched 0x00 0x00 0x00 0x00 0x00 0x00 0x00 +mul f32 $r2 $r2 0x38000187 +cvt rn f32 $r0 s16 0 $r0 +mul f32 $r1 $r1 0x38000187 +mul f32 $r0 $r0 0x38000187 +long ret +// RGBA16_SINT +$p1 suldgb b64 $r0d ca zero u8 g[$r4d] $r2 $p0 +set $p1 0x1 $p1 xor not $p2 +sched 0x00 0x00 0x00 0x00 0x00 0x00 0x00 +$p2 suldgb b64 $r0d cg zero u8 g[$r4d] $r2 $p0 +$p1 suldgb b64 $r0d cv zero u8 g[$r4d] $r2 $p0 +cvt s32 $r3 s16 1 $r1 +cvt s32 $r2 s16 0 $r1 +cvt s32 $r1 s16 1 $r0 +cvt s32 $r0 s16 0 $r0 +long ret +// RGBA16_UINT +sched 0x00 0x00 0x00 0x00 0x00 0x00 0x00 +$p1 suldgb b64 $r0d ca zero u8 g[$r4d] $r2 $p0 +set $p1 0x1 $p1 xor not $p2 +$p2 suldgb b64 $r0d cg zero u8 g[$r4d] $r2 $p0 +$p1 suldgb b64 $r0d cv zero u8 g[$r4d] $r2 $p0 +cvt u32 $r3 u16 1 $r1 +cvt u32 $r2 u16 0 $r1 +cvt u32 $r1 u16 1 $r0 +sched 0x00 0x00 0x00 0x00 0x00 0x00 0x00 +cvt u32 $r0 u16 0 $r0 +long ret +// RGBA16_FLOAT +$p1 suldgb b64 $r0d ca zero u8 g[$r4d] $r2 $p0 +set $p1 0x1 $p1 xor not $p2 +$p2 suldgb b64 $r0d cg zero u8 g[$r4d] $r2 $p0 +$p1 suldgb b64 $r0d cv zero u8 g[$r4d] $r2 $p0 +cvt f32 $r3 f16 $r1 1 +sched 0x00 0x00 0x00 0x00 0x00 0x00 0x00 +cvt f32 $r2 f16 $r1 0 +cvt f32 $r1 f16 $r0 1 +cvt f32 $r0 f16 $r0 0 +long ret +// RG32_FLOAT +$p1 suldgb b64 $r0d ca zero u8 g[$r4d] $r2 $p0 +set $p1 0x1 $p1 xor not $p2 +$p2 suldgb b64 $r0d cg zero u8 g[$r4d] $r2 $p0 +sched 0x00 0x00 0x00 0x00 0x00 0x00 0x00 +$p1 suldgb b64 $r0d cv zero u8 g[$r4d] $r2 $p0 +long mov b32 $r2 0x00000000 +long mov b32 $r3 0x3f800000 +long ret +// RG32_xINT +$p1 suldgb b64 $r0d ca zero u8 g[$r4d] $r2 $p0 +set $p1 0x1 $p1 xor not $p2 +$p2 suldgb b64 $r0d cg zero u8 g[$r4d] $r2 $p0 +sched 0x00 0x00 0x00 0x00 0x00 0x00 0x00 +$p1 suldgb b64 $r0d cv zero u8 g[$r4d] $r2 $p0 +long mov b32 $r2 0x00000000 +long mov b32 $r3 0x00000001 +long ret +// RGB10A2_UNORM +$p1 suldgb b32 $r0 ca zero u8 g[$r4d] $r2 $p0 +set $p1 0x1 $p1 xor not $p2 +$p2 suldgb b32 $r0 cg zero u8 g[$r4d] $r2 $p0 +sched 0x00 0x00 0x00 0x00 0x00 0x00 0x00 +$p1 suldgb b32 $r0 cv zero u8 g[$r4d] $r2 $p0 +ext u32 $r1 $r0 0x0a0a +long mov b32 $r3 0x3f800000 +ext u32 $r2 $r0 0x0a14 +long and b32 $r0 $r0 0x3ff +cvt rn f32 $r2 u16 0 $r2 +cvt rn f32 $r1 u16 0 $r1 +sched 0x00 0x00 0x00 0x00 0x00 0x00 0x00 +mul f32 $r2 $r2 0x3a802007 +cvt rn f32 $r0 u16 0 $r0 +mul f32 $r1 $r1 0x3a802007 +mul f32 $r0 $r0 0x3a802007 +long ret +// RGB10A2_UINT +$p1 suldgb b32 $r0 ca zero u8 g[$r4d] $r2 $p0 +set $p1 0x1 $p1 xor not $p2 +sched 0x00 0x00 0x00 0x00 0x00 0x00 0x00 +$p2 suldgb b32 $r0 cg zero u8 g[$r4d] $r2 $p0 +$p1 suldgb b32 $r0 cv zero u8 g[$r4d] $r2 $p0 +ext u32 $r1 $r0 0x0a0a +long mov b32 $r3 0x00000001 +ext u32 $r2 $r0 0x0a14 +long and b32 $r0 $r0 0x3ff +long ret +// RGBA8_UNORM +sched 0x00 0x00 0x00 0x00 0x00 0x00 0x00 +$p1 suldgb b32 $r0 ca zero u8 g[$r4d] $r2 $p0 +set $p1 0x1 $p1 xor not $p2 +$p2 suldgb b32 $r0 cg zero u8 g[$r4d] $r2 $p0 +$p1 suldgb b32 $r0 cv zero u8 g[$r4d] $r2 $p0 +cvt rn f32 $r3 u8 3 $r0 +cvt rn f32 $r2 u8 2 $r0 +mul f32 $r3 $r3 0x3b808081 +sched 0x00 0x00 0x00 0x00 0x00 0x00 0x00 +cvt rn f32 $r1 u8 1 $r0 +mul f32 $r2 $r2 0x3b808081 +cvt rn f32 $r0 u8 0 $r0 +mul f32 $r1 $r1 0x3b808081 +mul f32 $r0 $r0 0x3b808081 +long ret +// RGBA8_SNORM +$p1 suldgb b32 $r0 ca zero u8 g[$r4d] $r2 $p0 +sched 0x00 0x00 0x00 0x00 0x00 0x00 0x00 +set $p1 0x1 $p1 xor not $p2 +$p2 suldgb b32 $r0 cg zero u8 g[$r4d] $r2 $p0 +$p1 suldgb b32 $r0 cv zero u8 g[$r4d] $r2 $p0 +cvt rn f32 $r3 s8 3 $r0 +cvt rn f32 $r2 s8 2 $r0 +mul f32 $r3 $r3 0x3c010204 +cvt rn f32 $r1 s8 1 $r0 +sched 0x00 0x00 0x00 0x00 0x00 0x00 0x00 +mul f32 $r2 $r2 0x3c010204 +cvt rn f32 $r0 s8 0 $r0 +mul f32 $r1 $r1 0x3c010204 +mul f32 $r0 $r0 0x3c010204 +long ret +// RGBA8_SINT +$p1 suldgb b32 $r0 ca zero u8 g[$r4d] $r2 $p0 +set $p1 0x1 $p1 xor not $p2 +sched 0x00 0x00 0x00 0x00 0x00 0x00 0x00 +$p2 suldgb b32 $r0 cg zero u8 g[$r4d] $r2 $p0 +$p1 suldgb b32 $r0 cv zero u8 g[$r4d] $r2 $p0 +cvt s32 $r3 s8 3 $r0 +cvt s32 $r2 s8 2 $r0 +cvt s32 $r1 s8 1 $r0 +cvt s32 $r0 s8 0 $r0 +long ret +// RGBA8_UINT +sched 0x00 0x00 0x00 0x00 0x00 0x00 0x00 +$p1 suldgb b32 $r0 ca zero u8 g[$r4d] $r2 $p0 +set $p1 0x1 $p1 xor not $p2 +$p2 suldgb b32 $r0 cg zero u8 g[$r4d] $r2 $p0 +$p1 suldgb b32 $r0 cv zero u8 g[$r4d] $r2 $p0 +cvt u32 $r3 u8 3 $r0 +cvt u32 $r2 u8 2 $r0 +cvt u32 $r1 u8 1 $r0 +sched 0x00 0x00 0x00 0x00 0x00 0x00 0x00 +cvt u32 $r0 u8 0 $r0 +long ret +// R5G6B5_UNORM +$p1 suldgb u16 $r0 ca zero u8 g[$r4d] $r2 $p0 +set $p1 0x1 $p1 xor not $p2 +$p2 suldgb u16 $r0 cg zero u8 g[$r4d] $r2 $p0 +$p1 suldgb u16 $r0 cv zero u8 g[$r4d] $r2 $p0 +ext u32 $r1 $r0 0x0605 +sched 0x00 0x00 0x00 0x00 0x00 0x00 0x00 +long mov b32 $r3 0x3f800000 +ext u32 $r2 $r0 0x050b +long and b32 $r0 $r0 0x1f +cvt rn f32 $r2 u8 0 $r2 +cvt rn f32 $r1 u8 0 $r1 +mul f32 $r2 $r2 0x3d042108 +cvt rn f32 $r0 u8 0 $r0 +sched 0x00 0x00 0x00 0x00 0x00 0x00 0x00 +mul f32 $r1 $r1 0x3c820821 +mul f32 $r0 $r0 0x3d042108 +long ret +// R5G5B5X1_UNORM +$p1 suldgb u16 $r0 ca zero u8 g[$r4d] $r2 $p0 +set $p1 0x1 $p1 xor not $p2 +$p2 suldgb u16 $r0 cg zero u8 g[$r4d] $r2 $p0 +$p1 suldgb u16 $r0 cv zero u8 g[$r4d] $r2 $p0 +sched 0x00 0x00 0x00 0x00 0x00 0x00 0x00 +ext u32 $r1 $r0 0x0505 +ext u32 $r2 $r0 0x050a +long and b32 $r0 $r0 0x1f +long mov b32 $r3 0x3f800000 +cvt rn f32 $r2 u8 0 $r2 +cvt rn f32 $r1 u8 0 $r1 +cvt rn f32 $r0 u8 0 $r0 +sched 0x00 0x00 0x00 0x00 0x00 0x00 0x00 +mul f32 $r2 $r2 0x3d042108 +mul f32 $r1 $r1 0x3d042108 +mul f32 $r0 $r0 0x3d042108 +long ret +// RG16_UNORM +$p1 suldgb b32 $r0 ca zero u8 g[$r4d] $r2 $p0 +set $p1 0x1 $p1 xor not $p2 +$p2 suldgb b32 $r0 cg zero u8 g[$r4d] $r2 $p0 +sched 0x00 0x00 0x00 0x00 0x00 0x00 0x00 +$p1 suldgb b32 $r0 cv zero u8 g[$r4d] $r2 $p0 +cvt rn f32 $r1 u16 1 $r0 +cvt rn f32 $r0 u16 0 $r0 +mul f32 $r1 $r1 0x37800074 +mul f32 $r0 $r0 0x37800074 +long mov b32 $r2 0x00000000 +long mov b32 $r3 0x3f800000 +sched 0x00 0x00 0x00 0x00 0x00 0x00 0x00 +long ret +// RG16_SNORM +$p1 suldgb b32 $r0 ca zero u8 g[$r4d] $r2 $p0 +set $p1 0x1 $p1 xor not $p2 +$p2 suldgb b32 $r0 cg zero u8 g[$r4d] $r2 $p0 +$p1 suldgb b32 $r0 cv zero u8 g[$r4d] $r2 $p0 +mov b32 $r3 0x3f800000 +cvt rn f32 $r1 s16 1 $r0 +sched 0x00 0x00 0x00 0x00 0x00 0x00 0x00 +mov b32 $r2 0x00000000 +cvt rn f32 $r0 s16 0 $r0 +mul f32 $r1 $r1 0x38000187 +mul f32 $r0 $r0 0x38000187 +long ret +// RG16_SINT +$p1 suldgb b32 $r0 ca zero u8 g[$r4d] $r2 $p0 +set $p1 0x1 $p1 xor not $p2 +sched 0x00 0x00 0x00 0x00 0x00 0x00 0x00 +$p2 suldgb b32 $r0 cg zero u8 g[$r4d] $r2 $p0 +$p1 suldgb b32 $r0 cv zero u8 g[$r4d] $r2 $p0 +mov b32 $r3 0x00000001 +cvt s32 $r1 s16 1 $r0 +mov b32 $r2 0x00000000 +cvt s32 $r0 s16 0 $r0 +long ret +// RG16_UINT +sched 0x00 0x00 0x00 0x00 0x00 0x00 0x00 +$p1 suldgb b32 $r0 ca zero u8 g[$r4d] $r2 $p0 +set $p1 0x1 $p1 xor not $p2 +$p2 suldgb b32 $r0 cg zero u8 g[$r4d] $r2 $p0 +$p1 suldgb b32 $r0 cv zero u8 g[$r4d] $r2 $p0 +mov b32 $r3 0x00000001 +cvt u32 $r1 u16 1 $r0 +mov b32 $r2 0x00000000 +sched 0x00 0x00 0x00 0x00 0x00 0x00 0x00 +cvt u32 $r0 u16 0 $r0 +long ret +// RG16_FLOAT +$p1 suldgb b32 $r0 ca zero u8 g[$r4d] $r2 $p0 +set $p1 0x1 $p1 xor not $p2 +$p2 suldgb b32 $r0 cg zero u8 g[$r4d] $r2 $p0 +$p1 suldgb b32 $r0 cv zero u8 g[$r4d] $r2 $p0 +mov b32 $r3 0x3f800000 +sched 0x00 0x00 0x00 0x00 0x00 0x00 0x00 +cvt f32 $r1 f16 $r0 1 +mov b32 $r2 0x00000000 +cvt f32 $r0 f16 $r0 0 +long ret +// R32_FLOAT +$p1 suldgb b32 $r0 ca zero u8 g[$r4d] $r2 $p0 +set $p1 0x1 $p1 xor not $p2 +$p2 suldgb b32 $r0 cg zero u8 g[$r4d] $r2 $p0 +sched 0x00 0x00 0x00 0x00 0x00 0x00 0x00 +$p1 suldgb b32 $r0 cv zero u8 g[$r4d] $r2 $p0 +long mov b32 $r3 0x3f800000 +long mov b32 $r2 0x00000000 +long mov b32 $r1 0x00000000 +long ret +// R32_xINT +$p1 suldgb b32 $r0 ca zero u8 g[$r4d] $r2 $p0 +set $p1 0x1 $p1 xor not $p2 +sched 0x00 0x00 0x00 0x00 0x00 0x00 0x00 +$p2 suldgb b32 $r0 cg zero u8 g[$r4d] $r2 $p0 +$p1 suldgb b32 $r0 cv zero u8 g[$r4d] $r2 $p0 +long mov b32 $r3 0x00000001 +long mov b32 $r2 0x00000000 +long mov b32 $r1 0x00000000 +long ret +// RG8_UNORM +$p1 suldgb u16 $r0 ca zero u8 g[$r4d] $r2 $p0 +sched 0x00 0x00 0x00 0x00 0x00 0x00 0x00 +set $p1 0x1 $p1 xor not $p2 +$p2 suldgb u16 $r0 cg zero u8 g[$r4d] $r2 $p0 +$p1 suldgb u16 $r0 cv zero u8 g[$r4d] $r2 $p0 +mov b32 $r3 0x3f800000 +cvt rn f32 $r1 u8 1 $r0 +mov b32 $r2 0x00000000 +cvt rn f32 $r0 u8 0 $r0 +sched 0x00 0x00 0x00 0x00 0x00 0x00 0x00 +mul f32 $r1 $r1 0x3b808081 +mul f32 $r0 $r0 0x3b808081 +long ret +// RG8_SNORM +$p1 suldgb u16 $r0 ca zero u8 g[$r4d] $r2 $p0 +set $p1 0x1 $p1 xor not $p2 +$p2 suldgb u16 $r0 cg zero u8 g[$r4d] $r2 $p0 +$p1 suldgb u16 $r0 cv zero u8 g[$r4d] $r2 $p0 +sched 0x00 0x00 0x00 0x00 0x00 0x00 0x00 +long mov b32 $r3 0x3f800000 +cvt rn f32 $r1 s8 1 $r0 +long mov b32 $r2 0x00000000 +cvt rn f32 $r0 s8 0 $r0 +mul f32 $r1 $r1 0x3c010204 +mul f32 $r0 $r0 0x3c010204 +long ret +// RG8_UINT +sched 0x00 0x00 0x00 0x00 0x00 0x00 0x00 +$p1 suldgb u16 $r0 ca zero u8 g[$r4d] $r2 $p0 +set $p1 0x1 $p1 xor not $p2 +$p2 suldgb u16 $r0 cg zero u8 g[$r4d] $r2 $p0 +$p1 suldgb u16 $r0 cv zero u8 g[$r4d] $r2 $p0 +long mov b32 $r3 0x00000001 +cvt u32 $r1 u8 1 $r0 +long mov b32 $r2 0x00000000 +sched 0x00 0x00 0x00 0x00 0x00 0x00 0x00 +cvt u32 $r0 u8 0 $r0 +long ret +// RG8_SINT +$p1 suldgb u16 $r0 ca zero u8 g[$r4d] $r2 $p0 +set $p1 0x1 $p1 xor not $p2 +$p2 suldgb u16 $r0 cg zero u8 g[$r4d] $r2 $p0 +$p1 suldgb u16 $r0 cv zero u8 g[$r4d] $r2 $p0 +long mov b32 $r3 0x00000001 +sched 0x00 0x00 0x00 0x00 0x00 0x00 0x00 +cvt s32 $r1 s8 1 $r0 +long mov b32 $r2 0x00000000 +cvt s32 $r0 s8 0 $r0 +long ret +// R16_UNORM +$p1 suldgb u16 $r0 ca zero u8 g[$r4d] $r2 $p0 +set $p1 0x1 $p1 xor not $p2 +$p2 suldgb u16 $r0 cg zero u8 g[$r4d] $r2 $p0 +sched 0x00 0x00 0x00 0x00 0x00 0x00 0x00 +$p1 suldgb u16 $r0 cv zero u8 g[$r4d] $r2 $p0 +long mov b32 $r3 0x3f800000 +cvt rn f32 $r0 u16 0 $r0 +long mov b32 $r2 0x00000000 +long mov b32 $r1 0x00000000 +mul f32 $r0 $r0 0x37800074 +long ret +// R16_SNORM +sched 0x00 0x00 0x00 0x00 0x00 0x00 0x00 +$p1 suldgb u16 $r0 ca zero u8 g[$r4d] $r2 $p0 +set $p1 0x1 $p1 xor not $p2 +$p2 suldgb u16 $r0 cg zero u8 g[$r4d] $r2 $p0 +$p1 suldgb u16 $r0 cv zero u8 g[$r4d] $r2 $p0 +mov b32 $r3 0x3f800000 +cvt rn f32 $r0 s16 0 $r0 +long mov b32 $r2 0x00000000 +sched 0x00 0x00 0x00 0x00 0x00 0x00 0x00 +long mov b32 $r1 0x00000000 +mul f32 $r0 $r0 0x38000187 +long ret +// R16_SINT +$p1 suldgb s16 $r0 ca zero u8 g[$r4d] $r2 $p0 +set $p1 0x1 $p1 xor not $p2 +$p2 suldgb s16 $r0 cg zero u8 g[$r4d] $r2 $p0 +$p1 suldgb s16 $r0 cv zero u8 g[$r4d] $r2 $p0 +sched 0x00 0x00 0x00 0x00 0x00 0x00 0x00 +long mov b32 $r3 0x00000001 +long mov b32 $r2 0x00000000 +long mov b32 $r1 0x00000000 +long ret +// R16_UINT +$p1 suldgb u16 $r0 ca zero u8 g[$r4d] $r2 $p0 +set $p1 0x1 $p1 xor not $p2 +$p2 suldgb u16 $r0 cg zero u8 g[$r4d] $r2 $p0 +sched 0x00 0x00 0x00 0x00 0x00 0x00 0x00 +$p1 suldgb u16 $r0 cv zero u8 g[$r4d] $r2 $p0 +long mov b32 $r3 0x00000001 +long mov b32 $r2 0x00000000 +long mov b32 $r1 0x00000000 +long ret +// R16_FLOAT +$p1 suldgb u16 $r0 ca zero u8 g[$r4d] $r2 $p0 +set $p1 0x1 $p1 xor not $p2 +sched 0x00 0x00 0x00 0x00 0x00 0x00 0x00 +$p2 suldgb u16 $r0 cg zero u8 g[$r4d] $r2 $p0 +$p1 suldgb u16 $r0 cv zero u8 g[$r4d] $r2 $p0 +long mov b32 $r3 0x3f800000 +long mov b32 $r2 0x00000000 +cvt f32 $r0 f16 $r0 0 +mov b32 $r1 0x00000000 +long ret +// R8_UNORM +sched 0x00 0x00 0x00 0x00 0x00 0x00 0x00 +$p1 suldgb u8 $r0 ca zero u8 g[$r4d] $r2 $p0 +set $p1 0x1 $p1 xor not $p2 +$p2 suldgb u8 $r0 cg zero u8 g[$r4d] $r2 $p0 +$p1 suldgb u8 $r0 cv zero u8 g[$r4d] $r2 $p0 +mov b32 $r3 0x3f800000 +cvt rn f32 $r0 u8 0 $r0 +mov b32 $r2 0x00000000 +sched 0x00 0x00 0x00 0x00 0x00 0x00 0x00 +mul f32 $r0 $r0 0x3b808081 +mov b32 $r1 0x00000000 +long ret +// R8_SNORM +$p1 suldgb u8 $r0 ca zero u8 g[$r4d] $r2 $p0 +set $p1 0x1 $p1 xor not $p2 +$p2 suldgb u8 $r0 cg zero u8 g[$r4d] $r2 $p0 +$p1 suldgb u8 $r0 cv zero u8 g[$r4d] $r2 $p0 +sched 0x00 0x00 0x00 0x00 0x00 0x00 0x00 +mov b32 $r3 0x3f800000 +cvt rn f32 $r0 s8 0 $r0 +mov b32 $r2 0x00000000 +mul f32 $r0 $r0 0x3c010204 +mov b32 $r1 0x00000000 +long ret +// R8_SINT +$p1 suldgb s8 $r0 ca zero u8 g[$r4d] $r2 $p0 +sched 0x00 0x00 0x00 0x00 0x00 0x00 0x00 +set $p1 0x1 $p1 xor not $p2 +$p2 suldgb s8 $r0 cg zero u8 g[$r4d] $r2 $p0 +$p1 suldgb s8 $r0 cv zero u8 g[$r4d] $r2 $p0 +long mov b32 $r3 0x00000001 +long mov b32 $r2 0x00000000 +long mov b32 $r1 0x00000000 +long ret +// R8_UINT +sched 0x00 0x00 0x00 0x00 0x00 0x00 0x00 +$p1 suldgb u8 $r0 ca zero u8 g[$r4d] $r2 $p0 +set $p1 0x1 $p1 xor not $p2 +$p2 suldgb u8 $r0 cg zero u8 g[$r4d] $r2 $p0 +$p1 suldgb u8 $r0 cv zero u8 g[$r4d] $r2 $p0 +long mov b32 $r3 0x00000001 +long mov b32 $r2 0x00000000 +long mov b32 $r1 0x00000000 +sched 0x00 0x00 0x00 0x00 0x00 0x00 0x00 +long ret +// R11G11B10_FLOAT TODO +$p1 suldgb b32 $r3 ca zero u8 g[$r4d] $r2 $p0 +set $p1 0x1 $p1 xor not $p2 +$p2 suldgb b32 $r3 cg zero u8 g[$r4d] $r2 $p0 +$p1 suldgb b32 $r3 cv zero u8 g[$r4d] $r2 $p0 +long mov b32 $r3 0x3f800000 +long nop +long ret + + +// RCP F64: Newton Raphson reciprocal(x): r_{i+1} = r_i * (2.0 - x * r_i) +// +// INPUT: $r0d (x) +// OUTPUT: $r0d (rcp(x)) +// CLOBBER: $r2 - $r7 +// SIZE: 9 * 8 bytes +// +gk104_rcp_f64: + long nop + long ret + +// RSQ F64: Newton Raphson rsqrt(x): r_{i+1} = r_i * (1.5 - 0.5 * x * r_i * r_i) +// +// INPUT: $r0d (x) +// OUTPUT: $r0d (rsqrt(x)) +// CLOBBER: $r2 - $r7 +// SIZE: 14 * 8 bytes +// +gk104_rsq_f64: + long nop + long ret + +// +// Trap handler. +// Requires at least 4 GPRs and 32 bytes of l[] memory to temporarily save GPRs. +// Low 32 bytes of l[] memory shouldn't be used if resumeability is required. +// +// Trap info: +// 0x000: mutex +// 0x004: PC +// 0x008: trapstat +// 0x00c: warperr +// 0x010: tidx +// 0x014: tidy +// 0x018: tidz +// 0x01c: ctaidx +// 0x020: ctaidy +// 0x024: ctaidz +// 0x030: $r0q +// 0x130: $flags +// 0x140: s[] +// +st b128 wb l[0x00] $r0q +// check state of the warp and continue if it didn't cause the trap +long mov b32 $r1 $trapstat +long mov b32 $r3 $warperr +mov $r2 $flags mask 0xffff +and b32 0 $c $r1 $r3 +e $c bra #end_cont +// spill control flow stack to l[] +long mov b32 $r3 16 +spill_cfstack: +preret #end_exit +sub b32 $r3 $c $r3 0x1 +lg $c bra #spill_cfstack +// retrieve pointer to trap info +mov b32 $r0 c0[0x1900] +mov b32 $r1 c0[0x1904] +// we only let a single faulting thread store its state +mov b32 $r3 0x1 +exch b32 $r3 g[$r0d] $r3 +joinat #end_exit +set $p0 0x1 eq u32 $r3 0x1 +join $p0 nop +// store $c and $p registers +st b32 wb g[$r0d+0x130] $r2 +// store $trapstat and $warperr +long mov b32 $r2 $trapstat +long mov b32 $r3 $warperr +st b64 wb g[$r0d+0x8] $r2d +// store registers +st b128 wb g[$r0d+0x40] $r4q +st b128 wb g[$r0d+0x50] $r8q +st b128 wb g[$r0d+0x60] $r12q +st b128 wb g[$r0d+0x70] $r16q +st b128 wb g[$r0d+0x80] $r20q +st b128 wb g[$r0d+0x90] $r24q +st b128 wb g[$r0d+0xa0] $r28q +st b128 wb g[$r0d+0xb0] $r32q +st b128 wb g[$r0d+0xc0] $r36q +st b128 wb g[$r0d+0xd0] $r40q +st b128 wb g[$r0d+0xe0] $r44q +st b128 wb g[$r0d+0xf0] $r48q +st b128 wb g[$r0d+0x100] $r52q +st b128 wb g[$r0d+0x110] $r56q +st b128 wb g[$r0d+0x120] $r60q +ld b64 $r2d cs l[0x0] +st b64 wb g[$r0d+0x30] $r2d +ld b64 $r2d cs l[0x8] +st b64 wb g[$r0d+0x38] $r2d +// store thread id +long mov b32 $r2 $tidx +long mov b32 $r3 $tidy +st b64 wb g[$r0d+0x10] $r2d +long mov b32 $r2 $tidz +long mov b32 $r3 $ctaidx +st b64 wb g[$r0d+0x18] $r2d +long mov b32 $r2 $ctaidy +long mov b32 $r3 $ctaidz +st b64 wb g[$r0d+0x20] $r2d +// store shared memory (in reverse order so $r0d is base again at the end) +long mov b32 $r3 $smemsz +sub b32 $r3 $c $r3 0x4 +s $c bra #shared_done +add b32 $r0 $c $r0 $r3 +add b32 $r1 $r1 0x0 $c +shared_loop: +long ld b32 $r2 s[$r3] +long st b32 wb g[$r0d+0x140] $r2 +sub b32 $r0 $c $r0 0x4 +sub b32 $r1 $r1 0x0 $c +sub b32 $r3 $c $r3 0x4 +lg $c bra #shared_loop +shared_done: +// search the stack for trap entry to retrieve PC +mov b32 $r0 c0[0x1908] +mov b32 $r1 c0[0x190c] +membar sys +// invalidate caches so we can read stack entries via g[] +cctl ivall 0 l[0] +cctl ivall 0 g[$r0d] +// get offsets +mov b32 $r2 $physid +ext u32 $r3 $r2 0x0814 // MP id +ext u32 $r2 $r2 0x0608 // warp id +mul $r2 u32 $r2 u32 c0[0x1914] // warp offset +mul $r3 u32 $r3 u32 c0[0x1910] // MP offset +add b32 $r2 $r2 $r3 // MP + warp offset +add b32 $r0 $c $r0 $r2 +add b32 $r1 $r1 0x0 $c +search_cstack: +mov b32 $r3 c0[0x1918] // cstack size +ld u8 $r2 cv g[$r0d+0x8] +set $p0 0x1 eq u32 $r2 0xa +$p0 bra #entry_found +add b32 $r0 $c $r0 0x10 +add b32 $r1 $r1 0x0 $c +sub b32 $r3 $c $r3 0x10 +lg $c bra #search_cstack +bra #end_exit +entry_found: +// load PC (may be unaligned and spread out) +ld b32 $r2 cv g[$r0d] +mov b32 $r0 c0[0x1900] +mov b32 $r1 c0[0x1904] +st b32 wb g[$r0d+0x4] $r2 +join nop +// invalidate caches and exit +end_exit: +cctl ivall 0 g[0] +bpt pause 0x0 +rtt terminate +end_cont: +bpt pause 0x0 +mov $flags $r2 mask 0xffff +ld b128 $r0q cs l[0x00] +rtt + +.section #gk104_builtin_offsets +.b64 #gk104_div_u32 +.b64 #gk104_div_s32 +.b64 #gk104_rcp_f64 +.b64 #gk104_rsq_f64 diff --git a/src/gallium/drivers/nouveau/codegen/lib/gk104.asm.h b/src/gallium/drivers/nouveau/codegen/lib/gk104.asm.h new file mode 100644 index 00000000000..37998768efe --- /dev/null +++ b/src/gallium/drivers/nouveau/codegen/lib/gk104.asm.h @@ -0,0 +1,598 @@ +uint64_t gk104_builtin_code[] = { +/* 0x0000: gk104_div_u32 */ + 0x2282828042804287, + 0x7800000004009c03, + 0x380000007c209c82, + 0x180000000400dde2, + 0x6000000008309c03, + 0x1c00000005205d04, + 0x500000000810dc03, + 0x200400000c209c43, + 0x2282828282828287, + 0x500000000810dc03, + 0x200400000c209c43, + 0x500000000810dc03, + 0x200400000c209c43, + 0x500000000810dc03, + 0x200400000c209c43, + 0x500000000810dc03, + 0x2042c28280428047, + 0x200400000c209c43, + 0x280000000000dde4, + 0x5000000008001c43, + 0x1c00000005209d04, + 0x2006000000105c03, + 0x1b0e00000811dc03, + 0x4800000008104103, + 0x220282e20042c287, + 0x0800000004000002, + 0x1b0e00000811c003, + 0x4800000008104103, + 0x0800000004000002, + 0x9000000000001de7, +/* 0x00f0: gk104_div_s32 */ + 0x188e0000fc05dc23, + 0x18c40000fc17dc23, + 0x2280428042828207, + 0x1c00000001201ec4, + 0x1c00000005205ec4, + 0x7800000004009c03, + 0x380000007c209c82, + 0x180000000400dde2, + 0x6000000008309c03, + 0x1c00000005205d04, + 0x2282828282828287, + 0x500000000810dc03, + 0x200400000c209c43, + 0x500000000810dc03, + 0x200400000c209c43, + 0x500000000810dc03, + 0x200400000c209c43, + 0x500000000810dc03, + 0x2282804280428287, + 0x200400000c209c43, + 0x500000000810dc03, + 0x200400000c209c43, + 0x280000000000dde4, + 0x5000000008001c43, + 0x1c00000005209d04, + 0x2006000000105c03, + 0x22028042c28042c7, + 0x1b0e00000811dc03, + 0x4800000008104103, + 0x0800000004000002, + 0x1b0e00000811c003, + 0x4800000008104103, + 0x0800000004000002, + 0x1c00000001200f84, + 0x22c200428042e047, + 0x1c00000005204b84, + 0x9000000000001de7, + 0xd4004000084004c5, + 0x0c5400000013dc04, + 0xd4004000084009c5, + 0xd4004000084007c5, + 0x9000000000001de7, + 0x2000000000000007, + 0xd4004000084004c5, + 0x0c5400000013dc04, + 0xd4004000084009c5, + 0xd4004000084007c5, + 0x1900000004a0dc04, + 0x1800000004a09c04, + 0x30de0001d030dc02, + 0x2000000000000007, + 0x1900000000a05c04, + 0x30de0001d0209c02, + 0x1800000000a01c04, + 0x30de0001d0105c02, + 0x30de0001d0001c02, + 0x9000000000001de7, + 0xd4004000084004a5, + 0x2000000000000007, + 0x0c5400000013dc04, + 0xd4004000084009a5, + 0xd4004000084007a5, + 0x1900000004a0de04, + 0x1800000004a09e04, + 0x30e000061c30dc02, + 0x1900000000a05e04, + 0x2000000000000007, + 0x30e000061c209c02, + 0x1800000000a01e04, + 0x30e000061c105c02, + 0x30e000061c001c02, + 0x9000000000001de7, + 0xd4004000084004a5, + 0x0c5400000013dc04, + 0x2000000000000007, + 0xd4004000084009a5, + 0xd4004000084007a5, + 0x1d00000004a0de84, + 0x1c00000004a09e84, + 0x1d00000000a05e84, + 0x1c00000000a01e84, + 0x9000000000001de7, + 0x2000000000000007, + 0xd4004000084004a5, + 0x0c5400000013dc04, + 0xd4004000084009a5, + 0xd4004000084007a5, + 0x1d00000004a0dc04, + 0x1c00000004a09c04, + 0x1d00000000a05c04, + 0x2000000000000007, + 0x1c00000000a01c04, + 0x9000000000001de7, + 0xd4004000084004a5, + 0x0c5400000013dc04, + 0xd4004000084009a5, + 0xd4004000084007a5, + 0x1100000004a0dc04, + 0x2000000000000007, + 0x1000000004a09c04, + 0x1100000000a05c04, + 0x1000000000a01c04, + 0x9000000000001de7, + 0xd4004000084004a5, + 0x0c5400000013dc04, + 0xd4004000084009a5, + 0x2000000000000007, + 0xd4004000084007a5, + 0x1800000000009de2, + 0x18fe00000000dde2, + 0x9000000000001de7, + 0xd4004000084004a5, + 0x0c5400000013dc04, + 0xd4004000084009a5, + 0x2000000000000007, + 0xd4004000084007a5, + 0x1800000000009de2, + 0x180000000400dde2, + 0x9000000000001de7, + 0xd400400008400485, + 0x0c5400000013dc04, + 0xd400400008400985, + 0x2000000000000007, + 0xd400400008400785, + 0x7000c02828005c03, + 0x18fe00000000dde2, + 0x7000c02850009c03, + 0x3800000ffc001c02, + 0x1800000008a09c04, + 0x1800000004a05c04, + 0x2000000000000007, + 0x30ea00801c209c02, + 0x1800000000a01c04, + 0x30ea00801c105c02, + 0x30ea00801c001c02, + 0x9000000000001de7, + 0xd400400008400485, + 0x0c5400000013dc04, + 0x2000000000000007, + 0xd400400008400985, + 0xd400400008400785, + 0x7000c02828005c03, + 0x180000000400dde2, + 0x7000c02850009c03, + 0x3800000ffc001c02, + 0x9000000000001de7, + 0x2000000000000007, + 0xd400400008400485, + 0x0c5400000013dc04, + 0xd400400008400985, + 0xd400400008400785, + 0x198000000020dc04, + 0x1900000000209c04, + 0x30ee02020430dc02, + 0x2000000000000007, + 0x1880000000205c04, + 0x30ee020204209c02, + 0x1800000000201c04, + 0x30ee020204105c02, + 0x30ee020204001c02, + 0x9000000000001de7, + 0xd400400008400485, + 0x2000000000000007, + 0x0c5400000013dc04, + 0xd400400008400985, + 0xd400400008400785, + 0x198000000020de04, + 0x1900000000209e04, + 0x30f004081030dc02, + 0x1880000000205e04, + 0x2000000000000007, + 0x30f0040810209c02, + 0x1800000000201e04, + 0x30f0040810105c02, + 0x30f0040810001c02, + 0x9000000000001de7, + 0xd400400008400485, + 0x0c5400000013dc04, + 0x2000000000000007, + 0xd400400008400985, + 0xd400400008400785, + 0x1d8000000020de84, + 0x1d00000000209e84, + 0x1c80000000205e84, + 0x1c00000000201e84, + 0x9000000000001de7, + 0x2000000000000007, + 0xd400400008400485, + 0x0c5400000013dc04, + 0xd400400008400985, + 0xd400400008400785, + 0x1d8000000020dc04, + 0x1d00000000209c04, + 0x1c80000000205c04, + 0x2000000000000007, + 0x1c00000000201c04, + 0x9000000000001de7, + 0xd400400008400445, + 0x0c5400000013dc04, + 0xd400400008400945, + 0xd400400008400745, + 0x7000c01814005c03, + 0x2000000000000007, + 0x18fe00000000dde2, + 0x7000c0142c009c03, + 0x380000007c001c02, + 0x1800000008209c04, + 0x1800000004205c04, + 0x30f4108420209c02, + 0x1800000000201c04, + 0x2000000000000007, + 0x30f2082084105c02, + 0x30f4108420001c02, + 0x9000000000001de7, + 0xd400400008400445, + 0x0c5400000013dc04, + 0xd400400008400945, + 0xd400400008400745, + 0x2000000000000007, + 0x7000c01414005c03, + 0x7000c01428009c03, + 0x380000007c001c02, + 0x18fe00000000dde2, + 0x1800000008209c04, + 0x1800000004205c04, + 0x1800000000201c04, + 0x2000000000000007, + 0x30f4108420209c02, + 0x30f4108420105c02, + 0x30f4108420001c02, + 0x9000000000001de7, + 0xd400400008400485, + 0x0c5400000013dc04, + 0xd400400008400985, + 0x2000000000000007, + 0xd400400008400785, + 0x1900000000a05c04, + 0x1800000000a01c04, + 0x30de0001d0105c02, + 0x30de0001d0001c02, + 0x1800000000009de2, + 0x18fe00000000dde2, + 0x2000000000000007, + 0x9000000000001de7, + 0xd400400008400485, + 0x0c5400000013dc04, + 0xd400400008400985, + 0xd400400008400785, + 0x18fe00000000dde2, + 0x1900000000a05e04, + 0x2000000000000007, + 0x1800000000009de2, + 0x1800000000a01e04, + 0x30e000061c105c02, + 0x30e000061c001c02, + 0x9000000000001de7, + 0xd400400008400485, + 0x0c5400000013dc04, + 0x2000000000000007, + 0xd400400008400985, + 0xd400400008400785, + 0x180000000400dde2, + 0x1d00000000a05e84, + 0x1800000000009de2, + 0x1c00000000a01e84, + 0x9000000000001de7, + 0x2000000000000007, + 0xd400400008400485, + 0x0c5400000013dc04, + 0xd400400008400985, + 0xd400400008400785, + 0x180000000400dde2, + 0x1d00000000a05c04, + 0x1800000000009de2, + 0x2000000000000007, + 0x1c00000000a01c04, + 0x9000000000001de7, + 0xd400400008400485, + 0x0c5400000013dc04, + 0xd400400008400985, + 0xd400400008400785, + 0x18fe00000000dde2, + 0x2000000000000007, + 0x1100000000a05c04, + 0x1800000000009de2, + 0x1000000000a01c04, + 0x9000000000001de7, + 0xd400400008400485, + 0x0c5400000013dc04, + 0xd400400008400985, + 0x2000000000000007, + 0xd400400008400785, + 0x18fe00000000dde2, + 0x1800000000009de2, + 0x1800000000005de2, + 0x9000000000001de7, + 0xd400400008400485, + 0x0c5400000013dc04, + 0x2000000000000007, + 0xd400400008400985, + 0xd400400008400785, + 0x180000000400dde2, + 0x1800000000009de2, + 0x1800000000005de2, + 0x9000000000001de7, + 0xd400400008400445, + 0x2000000000000007, + 0x0c5400000013dc04, + 0xd400400008400945, + 0xd400400008400745, + 0x18fe00000000dde2, + 0x1880000000205c04, + 0x1800000000009de2, + 0x1800000000201c04, + 0x2000000000000007, + 0x30ee020204105c02, + 0x30ee020204001c02, + 0x9000000000001de7, + 0xd400400008400445, + 0x0c5400000013dc04, + 0xd400400008400945, + 0xd400400008400745, + 0x2000000000000007, + 0x18fe00000000dde2, + 0x1880000000205e04, + 0x1800000000009de2, + 0x1800000000201e04, + 0x30f0040810105c02, + 0x30f0040810001c02, + 0x9000000000001de7, + 0x2000000000000007, + 0xd400400008400445, + 0x0c5400000013dc04, + 0xd400400008400945, + 0xd400400008400745, + 0x180000000400dde2, + 0x1c80000000205c04, + 0x1800000000009de2, + 0x2000000000000007, + 0x1c00000000201c04, + 0x9000000000001de7, + 0xd400400008400445, + 0x0c5400000013dc04, + 0xd400400008400945, + 0xd400400008400745, + 0x180000000400dde2, + 0x2000000000000007, + 0x1c80000000205e84, + 0x1800000000009de2, + 0x1c00000000201e84, + 0x9000000000001de7, + 0xd400400008400445, + 0x0c5400000013dc04, + 0xd400400008400945, + 0x2000000000000007, + 0xd400400008400745, + 0x18fe00000000dde2, + 0x1800000000a01c04, + 0x1800000000009de2, + 0x1800000000005de2, + 0x30de0001d0001c02, + 0x9000000000001de7, + 0x2000000000000007, + 0xd400400008400445, + 0x0c5400000013dc04, + 0xd400400008400945, + 0xd400400008400745, + 0x18fe00000000dde2, + 0x1800000000a01e04, + 0x1800000000009de2, + 0x2000000000000007, + 0x1800000000005de2, + 0x30e000061c001c02, + 0x9000000000001de7, + 0xd400400008400465, + 0x0c5400000013dc04, + 0xd400400008400965, + 0xd400400008400765, + 0x2000000000000007, + 0x180000000400dde2, + 0x1800000000009de2, + 0x1800000000005de2, + 0x9000000000001de7, + 0xd400400008400445, + 0x0c5400000013dc04, + 0xd400400008400945, + 0x2000000000000007, + 0xd400400008400745, + 0x180000000400dde2, + 0x1800000000009de2, + 0x1800000000005de2, + 0x9000000000001de7, + 0xd400400008400445, + 0x0c5400000013dc04, + 0x2000000000000007, + 0xd400400008400945, + 0xd400400008400745, + 0x18fe00000000dde2, + 0x1800000000009de2, + 0x1000000000a01c04, + 0x1800000000005de2, + 0x9000000000001de7, + 0x2000000000000007, + 0xd400400008400405, + 0x0c5400000013dc04, + 0xd400400008400905, + 0xd400400008400705, + 0x18fe00000000dde2, + 0x1800000000201c04, + 0x1800000000009de2, + 0x2000000000000007, + 0x30ee020204001c02, + 0x1800000000005de2, + 0x9000000000001de7, + 0xd400400008400405, + 0x0c5400000013dc04, + 0xd400400008400905, + 0xd400400008400705, + 0x2000000000000007, + 0x18fe00000000dde2, + 0x1800000000201e04, + 0x1800000000009de2, + 0x30f0040810001c02, + 0x1800000000005de2, + 0x9000000000001de7, + 0xd400400008400425, + 0x2000000000000007, + 0x0c5400000013dc04, + 0xd400400008400925, + 0xd400400008400725, + 0x180000000400dde2, + 0x1800000000009de2, + 0x1800000000005de2, + 0x9000000000001de7, + 0x2000000000000007, + 0xd400400008400405, + 0x0c5400000013dc04, + 0xd400400008400905, + 0xd400400008400705, + 0x180000000400dde2, + 0x1800000000009de2, + 0x1800000000005de2, + 0x2000000000000007, + 0x9000000000001de7, + 0xd40040000840c485, + 0x0c5400000013dc04, + 0xd40040000840c985, + 0xd40040000840c785, + 0x18fe00000000dde2, + 0x4000000000001de4, + 0x9000000000001de7, +/* 0x0f08: gk104_rcp_f64 */ + 0x4000000000001de4, + 0x9000000000001de7, +/* 0x0f18: gk104_rsq_f64 */ + 0x4000000000001de4, + 0x9000000000001de7, + 0xc800000003f01cc5, + 0x2c00000100005c04, + 0x2c0000010800dc04, + 0x3000c3fffff09c04, + 0x680100000c1fdc03, + 0x4000000a60001c47, + 0x180000004000dde2, +/* 0x0f60: spill_cfstack */ + 0x78000009c0000007, + 0x0c0000000430dd02, + 0x4003ffffa0001ca7, + 0x2800406400001de4, + 0x2800406410005de4, + 0x180000000400dde2, + 0x547e18000000dd05, + 0x60000008e0000007, + 0x190ec0000431dc03, + 0x40000000000001f4, + 0x94000004c0009c85, + 0x2c00000100009c04, + 0x2c0000010800dc04, + 0x9400000020009ca5, + 0x9400000100011cc5, + 0x9400000140021cc5, + 0x9400000180031cc5, + 0x94000001c0041cc5, + 0x9400000200051cc5, + 0x9400000240061cc5, + 0x9400000280071cc5, + 0x94000002c0081cc5, + 0x9400000300091cc5, + 0x94000003400a1cc5, + 0x94000003800b1cc5, + 0x94000003c00c1cc5, + 0x94000004000d1cc5, + 0x94000004400e1cc5, + 0x94000004800f1cc5, + 0xc000000003f09ea5, + 0x94000000c0009ca5, + 0xc000000023f09ea5, + 0x94000000e0009ca5, + 0x2c00000084009c04, + 0x2c0000008800dc04, + 0x9400000040009ca5, + 0x2c0000008c009c04, + 0x2c0000009400dc04, + 0x9400000060009ca5, + 0x2c00000098009c04, + 0x2c0000009c00dc04, + 0x9400000080009ca5, + 0x2c000000c800dc04, + 0x0c0000001030dd02, + 0x4000000100001ea7, + 0x480100000c001c03, + 0x0800000000105c42, +/* 0x10d8: shared_loop */ + 0xc100000000309c85, + 0x9400000500009c85, + 0x0c00000010001d02, + 0x0800000000105d42, + 0x0c0000001030dd02, + 0x4003ffff40001ca7, +/* 0x1108: shared_done */ + 0x2800406420001de4, + 0x2800406430005de4, + 0xe000000000001c45, + 0xd000000003ffdcc5, + 0x9c000000000fdcc5, + 0x2c0000000c009c04, + 0x7000c0205020dc03, + 0x7000c01820209c03, + 0x5000406450209c03, + 0x500040644030dc03, + 0x480000000c209c03, + 0x4801000008001c03, + 0x0800000000105c42, +/* 0x1170: search_cstack */ + 0x280040646000dde4, + 0x8400000020009f05, + 0x190ec0002821dc03, + 0x40000000800001e7, + 0x0c00000040001c02, + 0x0800000000105c42, + 0x0c0000004030dd02, + 0x00029dff0ffc5cbf, +/* 0x11b0: entry_found */ + 0x8400000000009f85, + 0x2800406400001de4, + 0x2800406410005de4, + 0x9400000010009c85, + 0x4000000000001df4, +/* 0x11d8: end_exit */ + 0x9800000003ffdcc5, + 0xd000000000008007, + 0xa000000000004007, +/* 0x11f0: end_cont */ + 0xd000000000008007, + 0x3400c3fffc201c04, + 0xc000000003f01ec5, + 0xa000000000000007, +}; + +uint64_t gk104_builtin_offsets[] = { + 0x0000000000000000, + 0x00000000000000f0, + 0x0000000000000f08, + 0x0000000000000f18, +}; diff --git a/src/gallium/drivers/nouveau/codegen/lib/gk110.asm b/src/gallium/drivers/nouveau/codegen/lib/gk110.asm new file mode 100644 index 00000000000..be17871edd4 --- /dev/null +++ b/src/gallium/drivers/nouveau/codegen/lib/gk110.asm @@ -0,0 +1,98 @@ +.section #gk110_builtin_code +// DIV U32 +// +// UNR recurrence (q = a / b): +// look for z such that 2^32 - b <= b * z < 2^32 +// then q - 1 <= (a * z) / 2^32 <= q +// +// INPUT: $r0: dividend, $r1: divisor +// OUTPUT: $r0: result, $r1: modulus +// CLOBBER: $r2 - $r3, $p0 - $p1 +// SIZE: 22 / 14 * 8 bytes +// +gk110_div_u32: + sched 0x28282804280428 + bfind u32 $r2 $r1 + xor b32 $r2 $r2 0x1f + mov b32 $r3 0x1 + shl b32 $r2 $r3 clamp $r2 + cvt u32 $r1 neg u32 $r1 + mul $r3 u32 $r1 u32 $r2 + add $r2 (mul high u32 $r2 u32 $r3) $r2 + sched 0x28282828282828 + mul $r3 u32 $r1 u32 $r2 + add $r2 (mul high u32 $r2 u32 $r3) $r2 + mul $r3 u32 $r1 u32 $r2 + add $r2 (mul high u32 $r2 u32 $r3) $r2 + mul $r3 u32 $r1 u32 $r2 + add $r2 (mul high u32 $r2 u32 $r3) $r2 + mul $r3 u32 $r1 u32 $r2 + sched 0x042c2828042804 + add $r2 (mul high u32 $r2 u32 $r3) $r2 + mov b32 $r3 $r0 + mul high $r0 u32 $r0 u32 $r2 + cvt u32 $r2 neg u32 $r1 + add $r1 (mul u32 $r1 u32 $r0) $r3 + set $p0 0x1 ge u32 $r1 $r2 + $p0 sub b32 $r1 $r1 $r2 + sched 0x20282e20042c28 + $p0 add b32 $r0 $r0 0x1 + $p0 set $p0 0x1 ge u32 $r1 $r2 + $p0 sub b32 $r1 $r1 $r2 + $p0 add b32 $r0 $r0 0x1 + ret + +// DIV S32, like DIV U32 after taking ABS(inputs) +// +// INPUT: $r0: dividend, $r1: divisor +// OUTPUT: $r0: result, $r1: modulus +// CLOBBER: $r2 - $r3, $p0 - $p3 +// +gk110_div_s32: + set $p2 0x1 lt s32 $r0 0x0 + set $p3 0x1 lt s32 $r1 0x0 xor $p2 + sched 0x28042804282820 + cvt s32 $r0 abs s32 $r0 + cvt s32 $r1 abs s32 $r1 + bfind u32 $r2 $r1 + xor b32 $r2 $r2 0x1f + mov b32 $r3 0x1 + shl b32 $r2 $r3 clamp $r2 + cvt u32 $r1 neg u32 $r1 + sched 0x28282828282828 + mul $r3 u32 $r1 u32 $r2 + add $r2 (mul high u32 $r2 u32 $r3) $r2 + mul $r3 u32 $r1 u32 $r2 + add $r2 (mul high u32 $r2 u32 $r3) $r2 + mul $r3 u32 $r1 u32 $r2 + add $r2 (mul high u32 $r2 u32 $r3) $r2 + mul $r3 u32 $r1 u32 $r2 + sched 0x28280428042828 + add $r2 (mul high u32 $r2 u32 $r3) $r2 + mul $r3 u32 $r1 u32 $r2 + add $r2 (mul high u32 $r2 u32 $r3) $r2 + mov b32 $r3 $r0 + mul high $r0 u32 $r0 u32 $r2 + cvt u32 $r2 neg u32 $r1 + add $r1 (mul u32 $r1 u32 $r0) $r3 + sched 0x2028042c28042c + set $p0 0x1 ge u32 $r1 $r2 + $p0 sub b32 $r1 $r1 $r2 + $p0 add b32 $r0 $r0 0x1 + $p0 set $p0 0x1 ge u32 $r1 $r2 + $p0 sub b32 $r1 $r1 $r2 + $p0 add b32 $r0 $r0 0x1 + $p3 cvt s32 $r0 neg s32 $r0 + sched 0x2c200428042e04 + $p2 cvt s32 $r1 neg s32 $r1 + ret + +gk110_rcp_f64: +gk110_rsq_f64: + ret + +.section #gk110_builtin_offsets +.b64 #gk110_div_u32 +.b64 #gk110_div_s32 +.b64 #gk110_rcp_f64 +.b64 #gk110_rsq_f64 diff --git a/src/gallium/drivers/nouveau/codegen/lib/gk110.asm.h b/src/gallium/drivers/nouveau/codegen/lib/gk110.asm.h new file mode 100644 index 00000000000..8d00e2a2245 --- /dev/null +++ b/src/gallium/drivers/nouveau/codegen/lib/gk110.asm.h @@ -0,0 +1,81 @@ +uint64_t gk110_builtin_code[] = { +/* 0x0000: gk110_div_u32 */ + 0x08a0a0a010a010a0, + 0xe1800000009c000a, + 0x220000000f9c0808, + 0x74000000009fc00e, + 0xe2400000011c0c0a, + 0xe6010000009c2806, + 0xe1c00000011c040e, + 0xd2000800019c080a, + 0x08a0a0a0a0a0a0a0, + 0xe1c00000011c040e, + 0xd2000800019c080a, + 0xe1c00000011c040e, + 0xd2000800019c080a, + 0xe1c00000011c040e, + 0xd2000800019c080a, + 0xe1c00000011c040e, + 0x0810b0a0a010a010, + 0xd2000800019c080a, + 0xe4c03c00001c000e, + 0xe1c00400011c0002, + 0xe6010000009c280a, + 0xd0000c00001c0406, + 0xdb601c00011c041e, + 0xe088000001000406, + 0x0880a0b88010b0a0, + 0x4000000000800001, + 0xdb601c000100041e, + 0xe088000001000406, + 0x4000000000800001, + 0x19000000001c003c, +/* 0x00f0: gk110_div_s32 */ + 0xdb181c007f9c005e, + 0xdb1a08007f9c047e, + 0x08a010a010a0a080, + 0xe6100000001ce802, + 0xe6100000009ce806, + 0xe1800000009c000a, + 0x220000000f9c0808, + 0x74000000009fc00e, + 0xe2400000011c0c0a, + 0xe6010000009c2806, + 0x08a0a0a0a0a0a0a0, + 0xe1c00000011c040e, + 0xd2000800019c080a, + 0xe1c00000011c040e, + 0xd2000800019c080a, + 0xe1c00000011c040e, + 0xd2000800019c080a, + 0xe1c00000011c040e, + 0x08a0a010a010a0a0, + 0xd2000800019c080a, + 0xe1c00000011c040e, + 0xd2000800019c080a, + 0xe4c03c00001c000e, + 0xe1c00400011c0002, + 0xe6010000009c280a, + 0xd0000c00001c0406, + 0x0880a010b0a010b0, + 0xdb601c00011c041e, + 0xe088000001000406, + 0x4000000000800001, + 0xdb601c000100041e, + 0xe088000001000406, + 0x4000000000800001, + 0xe6010000000ce802, + 0x08b08010a010b810, + 0xe60100000088e806, + 0x19000000001c003c, +/* 0x0218: gk110_rcp_f64 */ +/* 0x0218: gk110_rsq_f64 */ + 0x19000000001c003c, +}; + +uint64_t gk110_builtin_offsets[] = { + 0x0000000000000000, + 0x00000000000000f0, + 0x0000000000000218, + 0x0000000000000218, +}; |