summaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorIlia Mirkin <[email protected]>2014-03-12 12:00:58 -0400
committerIlia Mirkin <[email protected]>2014-03-18 05:56:54 -0400
commit48a9ba63f5c9751052e472f8d7fb195ce874199d (patch)
tree3fda8977d0ca0432fabe0242d542d097bb0e8770
parent4bb14aca293b12cbe4f2352fb11c20091876c1cf (diff)
nv50/ir/gk110: add implementations of div u32/s32
Signed-off-by: Ilia Mirkin <[email protected]>
-rw-r--r--src/gallium/drivers/nouveau/codegen/target_lib_nvf0.asm86
-rw-r--r--src/gallium/drivers/nouveau/codegen/target_lib_nvf0.asm.h81
2 files changed, 162 insertions, 5 deletions
diff --git a/src/gallium/drivers/nouveau/codegen/target_lib_nvf0.asm b/src/gallium/drivers/nouveau/codegen/target_lib_nvf0.asm
new file mode 100644
index 00000000000..a0c5497524a
--- /dev/null
+++ b/src/gallium/drivers/nouveau/codegen/target_lib_nvf0.asm
@@ -0,0 +1,86 @@
+//
+// DIV U32: unsigned 32-bit divide returning quotient and remainder
+//
+// UNR recurrence (q = a / b): seed z = 1 << (bfind(b) ^ 0x1f), negate b, then run
+// z += mulhi(z, -b * z) five times, looking for z such that 2^32 - b <= b * z < 2^32;
+// then q - 1 <= (a * z) / 2^32 <= q; two predicated steps fix up q and r = a - b * q
+//
+// INPUT: $r0: dividend, $r1: divisor
+// OUTPUT: $r0: quotient, $r1: remainder
+// CLOBBER: $r2 - $r3, $p0 - $p1 ($p1 is listed but not written by the code below)
+// SIZE: 30 * 8 bytes (26 instructions + 4 sched words)
+//
+sched 0x28282804280428
+bfind u32 $r2 $r1
+xor b32 $r2 $r2 0x1f
+mov b32 $r3 0x1
+shl b32 $r2 $r3 clamp $r2
+cvt u32 $r1 neg u32 $r1
+mul $r3 u32 $r1 u32 $r2
+add $r2 (mul high u32 $r2 u32 $r3) $r2
+sched 0x28282828282828
+mul $r3 u32 $r1 u32 $r2
+add $r2 (mul high u32 $r2 u32 $r3) $r2
+mul $r3 u32 $r1 u32 $r2
+add $r2 (mul high u32 $r2 u32 $r3) $r2
+mul $r3 u32 $r1 u32 $r2
+add $r2 (mul high u32 $r2 u32 $r3) $r2
+mul $r3 u32 $r1 u32 $r2
+sched 0x042c2828042804
+add $r2 (mul high u32 $r2 u32 $r3) $r2
+mov b32 $r3 $r0
+mul high $r0 u32 $r0 u32 $r2
+cvt u32 $r2 neg u32 $r1
+add $r1 (mul u32 $r1 u32 $r0) $r3
+set $p0 0x1 ge u32 $r1 $r2
+$p0 sub b32 $r1 $r1 $r2
+sched 0x20282e20042c28
+$p0 add b32 $r0 $r0 0x1
+$p0 set $p0 0x1 ge u32 $r1 $r2
+$p0 sub b32 $r1 $r1 $r2
+$p0 add b32 $r0 $r0 0x1
+ret
+//
+// DIV S32, like DIV U32 after taking ABS(inputs); signs are restored just before ret
+// $p2 = (dividend < 0), $p3 = (divisor < 0) xor $p2, i.e. the operand signs differ
+// INPUT: $r0: dividend, $r1: divisor
+// OUTPUT: $r0: quotient (negated if $p3), $r1: remainder (negated if $p2, so it follows the dividend's sign)
+// CLOBBER: $r2 - $r3, $p0 - $p3 ($p1 is listed but not written below); SIZE: 37 * 8 bytes
+// NOTE(review): the first two instructions precede this routine's first sched word, so they presumably rely on DIV U32's last one - confirm before moving
+set $p2 0x1 lt s32 $r0 0x0
+set $p3 0x1 lt s32 $r1 0x0 xor $p2
+sched 0x28042804282820
+cvt s32 $r0 abs s32 $r0
+cvt s32 $r1 abs s32 $r1
+bfind u32 $r2 $r1
+xor b32 $r2 $r2 0x1f
+mov b32 $r3 0x1
+shl b32 $r2 $r3 clamp $r2
+cvt u32 $r1 neg u32 $r1
+sched 0x28282828282828
+mul $r3 u32 $r1 u32 $r2
+add $r2 (mul high u32 $r2 u32 $r3) $r2
+mul $r3 u32 $r1 u32 $r2
+add $r2 (mul high u32 $r2 u32 $r3) $r2
+mul $r3 u32 $r1 u32 $r2
+add $r2 (mul high u32 $r2 u32 $r3) $r2
+mul $r3 u32 $r1 u32 $r2
+sched 0x28280428042828
+add $r2 (mul high u32 $r2 u32 $r3) $r2
+mul $r3 u32 $r1 u32 $r2
+add $r2 (mul high u32 $r2 u32 $r3) $r2
+mov b32 $r3 $r0
+mul high $r0 u32 $r0 u32 $r2
+cvt u32 $r2 neg u32 $r1
+add $r1 (mul u32 $r1 u32 $r0) $r3
+sched 0x2028042c28042c
+set $p0 0x1 ge u32 $r1 $r2
+$p0 sub b32 $r1 $r1 $r2
+$p0 add b32 $r0 $r0 0x1
+$p0 set $p0 0x1 ge u32 $r1 $r2
+$p0 sub b32 $r1 $r1 $r2
+$p0 add b32 $r0 $r0 0x1
+$p3 cvt s32 $r0 neg s32 $r0
+sched 0x2c200428042e04
+$p2 cvt s32 $r1 neg s32 $r1
+ret
diff --git a/src/gallium/drivers/nouveau/codegen/target_lib_nvf0.asm.h b/src/gallium/drivers/nouveau/codegen/target_lib_nvf0.asm.h
index d10b6b07693..02c1ec646aa 100644
--- a/src/gallium/drivers/nouveau/codegen/target_lib_nvf0.asm.h
+++ b/src/gallium/drivers/nouveau/codegen/target_lib_nvf0.asm.h
@@ -1,13 +1,84 @@
+// Assembled from target_lib_nvf0.asm by envyas -m gk110 -W.
+
static const uint64_t nvf0_builtin_code[] =
{
- 0x19000000001c003cULL,
+// DIV U32: byte offset 0x0000, 30 words (sched words included), last word is ret
+0x08a0a0a010a010a0ULL,
+0xe1800000009c000aULL,
+0x220000000f9c0808ULL,
+0x74000000009fc00eULL,
+0xe2400000011c0c0aULL,
+0xe6010000009c2806ULL,
+0xe1c00000011c040eULL,
+0xd2000800019c080aULL,
+0x08a0a0a0a0a0a0a0ULL,
+0xe1c00000011c040eULL,
+0xd2000800019c080aULL,
+0xe1c00000011c040eULL,
+0xd2000800019c080aULL,
+0xe1c00000011c040eULL,
+0xd2000800019c080aULL,
+0xe1c00000011c040eULL,
+0x0810b0a0a010a010ULL,
+0xd2000800019c080aULL,
+0xe4c03c00001c000eULL,
+0xe1c00400011c0002ULL,
+0xe6010000009c280aULL,
+0xd0000c00001c0406ULL,
+0xdb601c00011c041eULL,
+0xe088000001000406ULL,
+0x0880a0b88010b0a0ULL,
+0x4000000000800001ULL,
+0xdb601c000100041eULL,
+0xe088000001000406ULL,
+0x4000000000800001ULL,
+0x19000000001c003cULL,
+// DIV S32: byte offset 0x00f0, 37 words (sched words included), last word is ret
+0xdb181c007f9c005eULL,
+0xdb1a08007f9c047eULL,
+0x08a010a010a0a080ULL,
+0xe6100000001ce802ULL,
+0xe6100000009ce806ULL,
+0xe1800000009c000aULL,
+0x220000000f9c0808ULL,
+0x74000000009fc00eULL,
+0xe2400000011c0c0aULL,
+0xe6010000009c2806ULL,
+0x08a0a0a0a0a0a0a0ULL,
+0xe1c00000011c040eULL,
+0xd2000800019c080aULL,
+0xe1c00000011c040eULL,
+0xd2000800019c080aULL,
+0xe1c00000011c040eULL,
+0xd2000800019c080aULL,
+0xe1c00000011c040eULL,
+0x08a0a010a010a0a0ULL,
+0xd2000800019c080aULL,
+0xe1c00000011c040eULL,
+0xd2000800019c080aULL,
+0xe4c03c00001c000eULL,
+0xe1c00400011c0002ULL,
+0xe6010000009c280aULL,
+0xd0000c00001c0406ULL,
+0x0880a010b0a010b0ULL,
+0xdb601c00011c041eULL,
+0xe088000001000406ULL,
+0x4000000000800001ULL,
+0xdb601c000100041eULL,
+0xe088000001000406ULL,
+0x4000000000800001ULL,
+0xe6010000000ce802ULL,
+0x08b08010a010b810ULL,
+0xe60100000088e806ULL,
+0x19000000001c003cULL,
};
static const uint16_t nvf0_builtin_offsets[NVC0_BUILTIN_COUNT] =
{
- 0,
- 0,
- 0,
- 0
+ 0x0000, /* start of the DIV U32 code in nvf0_builtin_code (byte offset) */
+ 0x00f0, /* start of the DIV S32 code: 30 words * 8 bytes past DIV U32 */
+ /* No code for the remaining two entries yet: point them at the ret that ends DIV U32. */
+ 0x00f0 - 8,
+ 0x00f0 - 8
};