# Copyright 2010 Luca Barbieri
#
# Permission is hereby granted, free of charge, to any person obtaining
# a copy of this software and associated documentation files (the
# "Software"), to deal in the Software without restriction, including
# without limitation the rights to use, copy, modify, merge, publish,
# distribute, sublicense, and/or sell copies of the Software, and to
# permit persons to whom the Software is furnished to do so, subject to
# the following conditions:
#
# The above copyright notice and this permission notice (including the
# next paragraph) shall be included in all copies or substantial
# portions of the Software.
#
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
# MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
# IN NO EVENT SHALL THE COPYRIGHT OWNER(S) AND/OR ITS SUPPLIERS BE
# LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
# OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
# WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
#
# *************************************************************************

# The code is a reimplementation of the algorithm in
#  www.fox-toolkit.org/ftp/fasthalffloatconversion.pdf
# "Fast Half Float Conversions" by Jeroen van der Zijp, Nov 2008
#
# The table contents have been slightly changed so that the exponent
# bias is now in the exponent table instead of the mantissa table (mostly
# for cosmetic reasons, and because it theoretically allows a variant
# that flushes denormal to zero but uses a mantissa table with 24-bit
# entries).
#
# The tables are also constructed slightly differently.
#

# Note that using a 64K * 4 table is a terrible idea since it will not fit
# in the L1 cache and will massively pollute the L2 cache as well
#
# These should instead fit in the L1 cache.
#
# TODO: we could use a denormal bias table instead of the mantissa/offset
# tables: this would reduce the L1 cache usage from 8704 to 2304 bytes
# but would involve more computation
#
# Note however that if denormals are never encountered, the L1 cache usage
# is only about 4608 bytes anyway.

# Bookkeeping shared by begin(), value() and end():
#   table_index  -- number of entries emitted so far for the current table
#   table_length -- number of entries the current table was declared with
# Both stay None until the first begin() call.
table_index = table_length = None

def begin(t, n, l):
	"""Emit the opening line of a C array definition and reset the counters.

	t -- C element type of the array, as a string (e.g. "uint32_t")
	n -- name of the C array, as a string
	l -- declared number of entries; end() checks that exactly this many
	     value() calls were made
	"""
	global table_length
	global table_index
	table_index = 0
	table_length = l
	# A parenthesised single argument is valid both for the Python 2 print
	# statement and for the Python 3 print function, with identical output.
	print("")
	print("const " + t + " " + n + "[" + str(l) + "] = {")

def value(v):
	"""Emit one array entry as a hexadecimal C literal and count it.

	v -- integer value of the entry
	"""
	global table_index
	table_index += 1
	# Python 2 renders long integers as e.g. '0x80000000L'; strip that suffix
	# so the generated literal is the same on every interpreter.
	print("\t" + hex(v).rstrip("L") + ",")

def end():
	"""Close the current C array and verify its entry count.

	Raises AssertionError when the number of value() calls made since the
	last begin() differs from the length that begin() was given.
	"""
	print("};")
	# Raised explicitly instead of using `assert`, so that the check is not
	# stripped when the generator is run with python -O.
	if table_index != table_length:
		raise AssertionError("table has %s entries, expected %s"
			% (table_index, table_length))

# Preamble of the generated C file.  Single-argument print(...) calls work
# unchanged under both Python 2 and Python 3.
print("/* This file is autogenerated by u_half.py. Do not edit directly. */")
print("#include \"util/u_half.h\"")

# Mantissa table: 1 zero entry + 1023 denormal entries + 1024 normal entries.
begin("uint32_t", "util_half_to_float_mantissa_table", 2048)
# zero
value(0)

# denormals
# (range() rather than xrange() so the script also runs under Python 3)
for i in range(1, 1024):
	m = i << 13
	e = 0

	# normalize number: shift left until bit 23 is set, lowering the
	# exponent field by one for every shift
	while (m & 0x00800000) == 0:
		e -= 0x00800000
		m <<= 1

	# drop bit 23 again, keeping only the 23 bits below it
	m &= ~0x00800000
	# 0x38800000 == 113 << 23, i.e. the float32 exponent field for 2**-14
	e += 0x38800000
	value(m | e)

# normals: the 10 mantissa bits moved to the top of the 23-bit field
for i in range(1024, 2048):
	value((i - 1024) << 13)
end()

# Exponent table: 32 entries for positive inputs followed by 32 entries for
# negative inputs (the same values with bit 31 set).
begin("uint32_t", "util_half_to_float_exponent_table", 64)
# positive zero or denormals
value(0)

# positive numbers
# 0x38000000 == 112 << 23, so entry i carries the exponent field i + 112
# (range() rather than xrange() so the script also runs under Python 3)
for i in range(1, 31):
	value(0x38000000 + (i << 23))

# positive infinity/NaN
value(0x7f800000)

# negative zero or denormals
value(0x80000000)

# negative numbers
for i in range(33, 63):
	value(0xb8000000 + ((i - 32) << 23))

# negative infinity/NaN
value(0xff800000)
end()

# Offset table: 0 for the zero/denormal slots, 1024 for every other slot.
# NOTE(review): 1024 is presumably the index at which the normal entries of
# the mantissa table start -- confirm against the C code that consumes it.
begin("uint32_t", "util_half_to_float_offset_table", 64)
# positive zero or denormals
value(0)

# positive normals
for i in range(1, 32):
	value(1024)

# negative zero or denormals
value(0)

# negative normals
# (range() rather than xrange() so the script also runs under Python 3)
for i in range(33, 64):
	value(1024)
end()

# Base table: 256 entries per sign.  The loop variable i walks the values
# -127..127 (255 entries), followed by one final infinity/NaN entry.
# (range() rather than xrange() so the script also runs under Python 3)
begin("uint16_t", "util_float_to_half_base_table", 512)
for sign in (0, 0x8000):
	# very small numbers mapping to zero
	for i in range(-127, -24):
		value(sign | 0)

	# small numbers mapping to denormals
	# a single bit, from 0x001 (i == -24) up to 0x200 (i == -15)
	for i in range(-24, -14):
		value(sign | (0x400 >> (-14 -i)))

	# normal numbers
	# i + 15 runs from 1 to 30 and is placed above the low 10 bits
	for i in range(-14, 16):
		value(sign | ((i + 15) << 10))

	# large numbers mapping to infinity
	for i in range(16, 128):
		value(sign | 0x7c00)

	# infinity and NaNs
	value(sign | 0x7c00)
end()

# Shift table: laid out like the base table (256 entries per sign).  The
# values do not depend on the sign, so both halves are identical.
# (range() rather than xrange() so the script also runs under Python 3)
begin("uint8_t", "util_float_to_half_shift_table", 512)
for sign in (0, 0x8000):
	# very small numbers mapping to zero
	for i in range(-127, -24):
		value(24)

	# small numbers mapping to denormals
	# shift goes from 23 (i == -24) down to 14 (i == -15)
	for i in range(-24, -14):
		value(-1 - i)

	# normal numbers
	for i in range(-14, 16):
		value(13)

	# large numbers mapping to infinity
	for i in range(16, 128):
		value(24)

	# infinity and NaNs
	value(13)
end()