I'm not quite sure what you mean by one LSB, but if you meant one ULP
(unit in last place) of a float, you'd be OK so long as you did the
computation in a double. If you're converting to an int, you should
be OK doing the initial calculation with doubles as well. My error
estimate was that you'd be looking at something like 200 ULPs worst
case (IOW, of a ~53 bit double, about 45 bits would be correct).
If you mean to one ULP of a *double*, life gets much more difficult.
>I'm mobile and don't have the code in front of me, IIRC the full
> function is something like MAGIC_NUM*(1 - e^(-3x)).
>
>Input will actually be floats, with a domain between 0 and 1.0 at
> 256 steps. The result will then be shifted and cast to a unit 16
> full with fractional representation and stored.
And you're trying to precompute those ~257 values? Something like
this?
#include <iostream>
#define LIMIT 1000000.

// Sums the tail of the Taylor series of e**x, starting from an already-formed
// term.
//
//   x    - the exponent; zexp() always passes a non-negative value.
//   term - the current series term, x**n / n!.
//   d    - divisor that turns this term into the next one (n + 1).
//
// The running term is carried as a single value instead of a separate power
// and factorial. Those two overflow to infinity long before their quotient
// does, which turned the term into inf/inf = NaN and made the original
// recursion unbounded for larger x.
//
// Written as a single return expression so it stays a valid C++11 constexpr
// function.
constexpr double zexp_w_tail(double x, double term, double d)
{
    return !(term <= 1.7976931348623157e308) ? term // +inf or NaN: stop and propagate it
         : (term < (1/LIMIT)) ? 0                   // below the absolute cut-off: drop it, done
         : (term + zexp_w_tail(x, term*(x/d), d+1));
}

// Worker for zexp(): returns the series sum
//   x2/d2 + (x2/d2)*(x/d) + (x2/d2)*(x/d)*(x/(d+1)) + ...
// which is e**x when called as zexp_w(x, 1, 1, 1) with x >= 0.
//
// Summation stops at the first term smaller than 1/LIMIT, and that term is not
// added. The cut-off is absolute, so LIMIT bounds the absolute error of the
// sum, not its relative error. A term that is negative (x < 0) is below the
// cut-off immediately, which is why zexp() handles negative arguments through
// the reciprocal instead.
//
// If the sum overflows the result is +infinity, and a NaN argument yields NaN;
// both cases now terminate.
constexpr double zexp_w(double x, double x2, double d, double d2)
{
    return zexp_w_tail(x, x2/d2, d);
}
// Compile-time-capable approximation of e**x.
//
// The series worker is only ever handed a non-negative exponent: a negative
// argument is evaluated as the reciprocal of e**(-x).
constexpr double zexp(double x)
{
    return !(x < 0)
        ? zexp_w(x, 1, 1, 1)
        : 1 / zexp_w(-x, 1, 1, 1);
}
// Prints one line of the form "e**<x> = <zexp(x)>" to standard output.
inline void test(double x)
{
    const double value = zexp(x);
    std::cout << "e**" << x << " = " << value << std::endl;
}
#define MAGIC_NUM 6.28

// Generator for one entry of the table a[]: for a step index x it returns
// MAGIC_NUM * (1 - e**(-3 * x/256)), computed in double and then narrowed to
// float.
constexpr float ff(double x)
{
    return static_cast<float>(MAGIC_NUM * (1 - zexp(-3 * (x / 256))));
}
// Lookup table with one entry per step: a[i] == ff(i) for i = 0 .. 256
// (257 entries, both endpoints included). Every initializer is a call to the
// constexpr function ff with a literal argument, so the values are eligible
// for evaluation at compile time.
float a[257] =
{
ff(0), ff(1), ff(2), ff(3), ff(4), ff(5), ff(6), ff(7),
ff(8), ff(9),
ff(10), ff(11), ff(12), ff(13), ff(14), ff(15), ff(16), ff(17),
ff(18), ff(19),
ff(20), ff(21), ff(22), ff(23), ff(24), ff(25), ff(26), ff(27),
ff(28), ff(29),
ff(30), ff(31), ff(32), ff(33), ff(34), ff(35), ff(36), ff(37),
ff(38), ff(39),
ff(40), ff(41), ff(42), ff(43), ff(44), ff(45), ff(46), ff(47),
ff(48), ff(49),
ff(50), ff(51), ff(52), ff(53), ff(54), ff(55), ff(56), ff(57),
ff(58), ff(59),
ff(60), ff(61), ff(62), ff(63), ff(64), ff(65), ff(66), ff(67),
ff(68), ff(69),
ff(70), ff(71), ff(72), ff(73), ff(74), ff(75), ff(76), ff(77),
ff(78), ff(79),
ff(80), ff(81), ff(82), ff(83), ff(84), ff(85), ff(86), ff(87),
ff(88), ff(89),
ff(90), ff(91), ff(92), ff(93), ff(94), ff(95), ff(96), ff(97),
ff(98), ff(99),
ff(100),ff(101),ff(102),ff(103),ff(104),ff(105),ff(106),ff(107),ff(108),ff(109),
ff(110),ff(111),ff(112),ff(113),ff(114),ff(115),ff(116),ff(117),ff(118),ff(119),
ff(120),ff(121),ff(122),ff(123),ff(124),ff(125),ff(126),ff(127),ff(128),ff(129),
ff(130),ff(131),ff(132),ff(133),ff(134),ff(135),ff(136),ff(137),ff(138),ff(139),
ff(140),ff(141),ff(142),ff(143),ff(144),ff(145),ff(146),ff(147),ff(148),ff(149),
ff(150),ff(151),ff(152),ff(153),ff(154),ff(155),ff(156),ff(157),ff(158),ff(159),
ff(160),ff(161),ff(162),ff(163),ff(164),ff(165),ff(166),ff(167),ff(168),ff(169),
ff(170),ff(171),ff(172),ff(173),ff(174),ff(175),ff(176),ff(177),ff(178),ff(179),
ff(180),ff(181),ff(182),ff(183),ff(184),ff(185),ff(186),ff(187),ff(188),ff(189),
ff(190),ff(191),ff(192),ff(193),ff(194),ff(195),ff(196),ff(197),ff(198),ff(199),
ff(200),ff(201),ff(202),ff(203),ff(204),ff(205),ff(206),ff(207),ff(208),ff(209),
ff(210),ff(211),ff(212),ff(213),ff(214),ff(215),ff(216),ff(217),ff(218),ff(219),
ff(220),ff(221),ff(222),ff(223),ff(224),ff(225),ff(226),ff(227),ff(228),ff(229),
ff(230),ff(231),ff(232),ff(233),ff(234),ff(235),ff(236),ff(237),ff(238),ff(239),
ff(240),ff(241),ff(242),ff(243),ff(244),ff(245),ff(246),ff(247),ff(248),ff(249),
ff(250),ff(251),ff(252),ff(253),ff(254),ff(255),ff(256)
};
// Demo driver: prints zexp() for a handful of sample arguments, then dumps
// every entry of the table a[] next to the expression it was generated from.
int main(void)
{
    // Sample exponents: zero, positive values of growing size, and negatives.
    const double samples[] = { 0.0, 1.0, 1.5, 2.0, 3.14, 42.0, -1.0, -3.14, 19.4 };
    for (const double sample : samples)
        test(sample);

    // One output line per table entry; the exponent shown is -3 * step/256.
    for (int step = 0; step < 257; ++step) {
        const double exponent = -3*(step/256.);
        std::cout << MAGIC_NUM << "*(1-e**(" << exponent << ") = "
                  << a[step] << std::endl;
    }
}
(Again, you'll need to generate LIMIT better.)
Output:
e**0 = 1
e**1 = 2.71828
e**1.5 = 4.48169
e**2 = 7.38906
e**3.14 = 23.1039
e**42 = 1.73927e+18
e**-1 = 0.367879
e**-3.14 = 0.0432828
e**19.4 = 2.66264e+08
6.28*(1-e**(-0) = 0
6.28*(1-e**(-0.0117188) = 0.0731626
6.28*(1-e**(-0.0234375) = 0.145476
6.28*(1-e**(-0.0351563) = 0.216945
6.28*(1-e**(-0.046875) = 0.287581
6.28*(1-e**(-0.0585938) = 0.357393
6.28*(1-e**(-0.0703125) = 0.426396
6.28*(1-e**(-0.0820313) = 0.494593
6.28*(1-e**(-0.09375) = 0.561995
6.28*(1-e**(-0.105469) = 0.628611
6.28*(1-e**(-0.117188) = 0.694451
...
6.28*(1-e**(-2.87109) = 5.92432
6.28*(1-e**(-2.88281) = 5.92846
6.28*(1-e**(-2.89453) = 5.93256
6.28*(1-e**(-2.90625) = 5.93661
6.28*(1-e**(-2.91797) = 5.94061
6.28*(1-e**(-2.92969) = 5.94456
6.28*(1-e**(-2.94141) = 5.94847
6.28*(1-e**(-2.95313) = 5.95233
6.28*(1-e**(-2.96484) = 5.95615
6.28*(1-e**(-2.97656) = 5.95992
6.28*(1-e**(-2.98828) = 5.96365
6.28*(1-e**(-3) = 5.96734