@@ -61,11 +61,133 @@ void micros_overflow_tick(void* arg) {
6161 micros_at_last_overflow_tick = m ;
6262}
6363
64- unsigned long ICACHE_RAM_ATTR millis () {
65- uint32_t m = system_get_time ();
66- uint32_t c = micros_overflow_count + ((m < micros_at_last_overflow_tick ) ? 1 : 0 );
67- return c * 4294967 + m / 1000 ;
68- }
64+ //---------------------------------------------------------------------------
65+ // millis() 'magic multiplier' approximation
66+ //
67+ // This function corrects the cumulative (296us / usec overflow) drift
68+ // seen in the original 'millis()' function.
69+ //
70+ // Input:
71+ // 'm' - 32-bit usec counter, 0 <= m <= 0xFFFFFFFF
72+ // 'c' - 32-bit usec overflow counter 0 <= c < 0x00400000
73+ // Output:
74+ // Returns milliseconds in modulo 0x1,0000,0000 (0 to 0xFFFFFFFF)
75+ //
76+ // Notes:
77+ //
78+ // 1) This routine approximates the 64-bit integer division,
79+ //
80+ // quotient = ( 2^32 c + m ) / 1000,
81+ //
82+ // through the use of 'magic' multipliers. A slow division is replaced by
83+ // a faster multiply using a scaled multiplicative inverse of the divisor:
84+ //
85+ // quotient =~ ( 2^32 c + m ) * k, where k = Ceiling[ 2^n / 1000 ]
86+ //
87+ // The precision difference between multiplier and divisor sets the
88+ // upper-bound of the dividend which can be successfully divided.
89+ //
90+ // For this application, n = 64, and the divisor (1000) has 10-bits of
91+ // precision. This sets the dividend upper-bound to (64 - 10) = 54 bits,
92+ // and that of 'c' to (54 - 32) = 22 bits. This corresponds to a value
93+ // for 'c' = 0x0040,0000 , or +570 years of usec counter overflows.
94+ //
95+ // 2) A distributed multiply with offset-summing is used to find k( 2^32 c + m ):
96+ //
97+ // prd = (2^32 kh + kl) * ( 2^32 c + m )
98+ // = 2^64 kh c + 2^32 kl c + 2^32 kh m + kl m
99+ // (d) (c) (b) (a)
100+ //
101+ // Graphically, the offset-sums align in little endian like this:
102+ // LS -> MS
103+ // 32 64 96 128
104+ // | a[-1] | a[0] | a[1] | a[2] |
105+ // | m kl | 0 | 0 | a[-1] not needed
106+ // | | m kh | |
107+ // | | c kl | | a[1] holds the result
108+ // | | | c kh | a[2] can be discarded
109+ //
110+ // As only the high-word of 'm kl' and low-word of 'c kh' contribute to the
111+ // overall result, only (2) 32-bit words are needed for the accumulator.
112+ //
113+ // 3) As C++ does not intrinsically test for addition overflows, one must
114+ // code specifically to detect them. This approximation skips these
115+ // overflow checks for speed, hence the sum,
116+ //
117+ // highword( m kl ) + m kh + c kl < (2^64-1), MUST NOT OVERFLOW.
118+ //
119+ // To meet this criterion, not only do we have to pick 'k' to achieve our
120+ // desired precision, we also have to split 'k' appropriately to avoid
121+ // any addition overflows.
122+ //
123+ // 'k' should be also chosen to align the various products on byte
124+ // boundaries to avoid any 64-bit shifts before additions, as they incur
125+ // major time penalties. The 'k' chosen for this specific division by 1000
126+ // was picked primarily to avoid shifts as well as for precision.
127+ //
128+ // For the reasons listed above, this routine is NOT a general one.
129+ // Changing divisors could break the overflow requirement and force
130+ // picking a 'k' split which requires shifts before additions.
131+ //
132+ // ** Test THOROUGHLY after making changes **
133+ //
134+ // 4) Results of time benchmarks run on an ESP8266 Huzzah feather are:
135+ //
136+ // usec x Orig Comment
137+ // Orig: 3.18 1.00 Original code
138+ // Corr: 13.21 4.15 64-bit reference code
139+ // Test: 4.60 1.45 64-bit magic multiply, 4x32
140+ //
141+ // The magic multiplier routine runs ~3x faster than the reference. Execution
142+ // times can vary considerably with the numbers being multiplied, so one
143+ // should derate this factor to around 2x, worst case.
144+ //
145+ // Reference function: corrected millis(), 64-bit arithmetic,
146+ // truncated to 32-bits by return
147+ // unsigned long ICACHE_RAM_ATTR millis_corr_DEBUG( void )
148+ // {
149+ // // Get usec system time, usec overflow counter
150+ // ......
151+ // return ( (c * 4294967296 + m) / 1000 ); // 64-bit division is SLOW
152+ // } //millis_corr
153+ //
154+ // 5) See this link for a good discussion on magic multipliers:
155+ // http://ridiculousfish.com/blog/posts/labor-of-division-episode-i.html
156+ //
157+
158+ #define MAGIC_1E3_wLO 0x4bc6a7f0 // LS part
159+ #define MAGIC_1E3_wHI 0x00418937 // MS part, magic multiplier
160+
161+ unsigned long ICACHE_RAM_ATTR millis ()
162+ {
163+ union {
164+ uint64_t q ; // Accumulator, 64-bit, little endian
165+ uint32_t a [2 ]; // ..........., 32-bit segments
166+ } acc ;
167+ acc .a [1 ] = 0 ; // Zero high-acc
168+
169+ // Get usec system time, usec overflow counter
170+ uint32_t m = system_get_time ();
171+ uint32_t c = micros_overflow_count +
172+ ((m < micros_at_last_overflow_tick ) ? 1 : 0 );
173+
174+ // (a) Init. low-acc with high-word of 1st product. The right-shift
175+ // falls on a byte boundary, hence is relatively quick.
176+
177+ acc .q = ( (uint64_t )( m * (uint64_t )MAGIC_1E3_wLO ) >> 32 );
178+
179+ // (b) Offset sum, low-acc
180+ acc .q += ( m * (uint64_t )MAGIC_1E3_wHI );
181+
182+ // (c) Offset sum, low-acc
183+ acc .q += ( c * (uint64_t )MAGIC_1E3_wLO );
184+
185+ // (d) Truncated sum, high-acc
186+ acc .a [1 ] += (uint32_t )( c * (uint64_t )MAGIC_1E3_wHI );
187+
188+ return ( acc .a [1 ] ); // Extract result, high-acc
189+
190+ } //millis
69191
70192unsigned long ICACHE_RAM_ATTR micros () {
71193 return system_get_time ();
0 commit comments