From 8656d9f4d89834664eed21aa36ba74a1413f857a Mon Sep 17 00:00:00 2001
From: mrwgx3 <23343823+mrwgx3@users.noreply.github.com>
Date: Tue, 30 Jan 2018 16:47:28 -0700
Subject: [PATCH 1/5] Correct millis() drift, issue 3078

---
 cores/esp8266/core_esp8266_wiring.c | 129 ++++++++++++++++++++++++++--
 1 file changed, 124 insertions(+), 5 deletions(-)

diff --git a/cores/esp8266/core_esp8266_wiring.c b/cores/esp8266/core_esp8266_wiring.c
index 4b55a711e0..47f6e018c4 100644
--- a/cores/esp8266/core_esp8266_wiring.c
+++ b/cores/esp8266/core_esp8266_wiring.c
@@ -61,11 +61,130 @@ void micros_overflow_tick(void* arg) {
     micros_at_last_overflow_tick = m;
 }
 
-unsigned long ICACHE_RAM_ATTR millis() {
-    uint32_t m = system_get_time();
-    uint32_t c = micros_overflow_count + ((m < micros_at_last_overflow_tick) ? 1 : 0);
-    return c * 4294967 + m / 1000;
-}
+//---------------------------------------------------------------------------
+// millis() 'magic multiplier' approximation
+//
+// This function corrects the cumulative (296us / usec overflow) drift
+// seen in the original 'millis()' function.
+//
+// Input:
+//    'm' - 32-bit usec counter,           0 <= m <= 0xFFFFFFFF
+//    'c' - 32-bit usec overflow counter   0 <= c <  0x00400000
+// Output:
+//    Returns milliseconds in modulo 0x1,0000,0000 (0 to 0xFFFFFFFF)
+//
+// Notes:
+//
+// 1) This routine approximates the 64-bit integer division,
+//
+//    quotient =  ( 2^32 c + m ) / 1000,
+//
+//    through the use of 'magic' multipliers. A slow division is replaced by
+//    a faster multiply using a scaled multiplicative inverse of the divisor:
+//
+//    quotient =~ ( 2^32 c + m ) * k,  where k = Ceiling[ 2^n / 1000 ]
+//
+//    The precision difference between multiplier and divisor sets the
+//    upper-bound of the dividend which can be successfully divided.
+//
+//    For this application, n = 64, and the divisor (1000) has 10-bits of 
+//    precision. This sets the dividend upper-bound to (64 - 10) = 54 bits,
+//    and that of 'c' to (54 - 32) = 22 bits. This corresponds to a value
+//    for 'c' = 0x0040,0000 , or +570 years of usec counter overflows.
+//
+// 2) A distributed multiply with offset-summing is used to find k( 2^32 c + m ):
+//
+//      prd = (2^32 kh + kl) * ( 2^32 c + m )
+//          = 2^64 kh c + 2^32 kl c + 2^32 kh m + kl m
+//               (d)         (c)         (b)       (a)
+//
+//    Graphically, the offset-sums align in little endian like this:
+//                  LS -> MS
+//            32       64       96      128
+//    | a[-1]  |  a[0]  |  a[1]  |  a[2]  |
+//    |       m kl      |    0   |    0   |  a[-1] not needed
+//    |        |       m kh      |        |
+//    |        |       c kl      |        |  a[1] holds the result
+//    |        |        |       c kh      |  a[2] can be discarded
+//
+//    As only the high-word of 'm kl' and low-word of 'c kh' contribute to the
+//    overall result, only (2) 32-bit words are needed for the accumulator.
+//
+// 3) As C++ does not intrinsically test for addition overflows, one must
+//    to code specifically to detect them. This approximation skips these
+//    overflow checks for speed, hence the sum,
+//
+//    highword( m kl ) + m kh + c kl  <  (2^64-1),  MUST NOT OVERFLOW.
+//
+//    To meet this criterion, not only do we have to pick 'k' to achieve our
+//    desired precision, we also have to split 'k' appropriately to avoid
+//    any addition overflows.
+//
+//    'k' should also be chosen to align the various products on byte
+//    boundaries to avoid any 64-bit shifts before additions, as they incur
+//    major time penalties. The 'k' chosen for this specific division by 1000
+//    was picked primarily to avoid shifts as well as for precision.
+//
+//    For the reasons listed above, this routine is NOT a general one.
+//    Changing divisors could break the overflow requirement and force
+//    picking a 'k' split which requires shifts before additions.
+//
+//              ** Test THOROUGHLY after making changes **
+//
+// 4) Results of time benchmarks run on an ESP8266 Huzzah feather are:
+//
+//         usec   x Orig   Comment
+// Orig:   3.18   1.00     Original code
+// Corr:  13.21   4.15     64-bit reference code
+// Test:   4.60   1.45     64-bit magic multiply, 4x32
+//
+// The magic multiplier routine runs ~3x faster than the reference. Execution
+// times can vary considerably with the numbers being multiplied, so one
+// should derate this factor to around 2x, worst case.
+//
+//   Reference function: corrected millis(), 64-bit arithmetic,
+//                       truncated to 32-bits by return
+//   unsigned long ICACHE_RAM_ATTR millis_corr_DEBUG( void )
+//   {
+//     // Get usec system time, usec overflow counter
+//     ......
+//     return ( (c * 4294967296 + m) / 1000 );  // 64-bit division is SLOW
+//   } //millis_corr_DEBUG
+//
+// 5) See this link for a good discussion on magic multipliers:
+//    http://ridiculousfish.com/blog/posts/labor-of-division-episode-i.html
+//
+
+#define  MAGIC_1E3_wLO  0x4bc6a7f0    // LS part
+#define  MAGIC_1E3_wHI  0x00418937    // MS part, magic multiplier
+
+unsigned long ICACHE_RAM_ATTR millis()
+{
+  uint32_t  a[2];  // Accumulator, little endian
+  a[1] = 0;        // Zero high-acc
+
+  // Get usec system time, usec overflow counter
+  uint32_t  m = system_get_time();
+  uint32_t  c = micros_overflow_count +
+                   ((m < micros_at_last_overflow_tick) ? 1 : 0);
+
+  // (a) Init. low-acc with high-word of 1st product. The right-shift
+  //     falls on a byte boundary, hence is relatively quick.
+  ((uint64_t *)(&a[0]))[0]  =
+     ( (uint64_t)( m * (uint64_t)MAGIC_1E3_wLO ) >> 32 );
+
+  ((uint64_t *)(&a[0]))[0] +=              // (b) Offset sum, low-acc
+     ( m * (uint64_t)MAGIC_1E3_wHI );
+
+  ((uint64_t *)(&a[0]))[0] +=              // (c) Offset sum, low-acc
+     ( c * (uint64_t)MAGIC_1E3_wLO );
+  
+  ((uint32_t *)(&a[1]))[0] +=              // (d) Truncated sum, low-acc
+     (uint32_t)( c * (uint64_t)MAGIC_1E3_wHI );
+
+  return ( a[1] );  // Extract result, high-acc
+
+} //millis
 
 unsigned long ICACHE_RAM_ATTR micros() {
     return system_get_time();

From 56e13f41729c97cc6cd1cf1279f44c0d505194ce Mon Sep 17 00:00:00 2001
From: mrwgx3 <23343823+mrwgx3@users.noreply.github.com>
Date: Thu, 1 Feb 2018 12:28:52 -0700
Subject: [PATCH 2/5] Add 'test_millis_mm.ino' runtime benchmark

---
 cores/esp8266/core_esp8266_wiring.c           |   2 +-
 .../device/test_millis_mm/test_millis_mm.ino  | 484 ++++++++++++++++++
 2 files changed, 485 insertions(+), 1 deletion(-)
 create mode 100644 tests/device/test_millis_mm/test_millis_mm.ino

diff --git a/cores/esp8266/core_esp8266_wiring.c b/cores/esp8266/core_esp8266_wiring.c
index 47f6e018c4..68ead05c6d 100644
--- a/cores/esp8266/core_esp8266_wiring.c
+++ b/cores/esp8266/core_esp8266_wiring.c
@@ -111,7 +111,7 @@ void micros_overflow_tick(void* arg) {
 //    overall result, only (2) 32-bit words are needed for the accumulator.
 //
 // 3) As C++ does not intrinsically test for addition overflows, one must
-//    to code specifically to detect them. This approximation skips these
+//    code specifically to detect them. This approximation skips these
 //    overflow checks for speed, hence the sum,
 //
 //    highword( m kl ) + m kh + c kl  <  (2^64-1),  MUST NOT OVERFLOW.
diff --git a/tests/device/test_millis_mm/test_millis_mm.ino b/tests/device/test_millis_mm/test_millis_mm.ino
new file mode 100644
index 0000000000..eaca0fe8b3
--- /dev/null
+++ b/tests/device/test_millis_mm/test_millis_mm.ino
@@ -0,0 +1,484 @@
+//
+// Millis() Runtime Benchmark
+//
+//  Code to determine the runtime in 'usec' of various millis()
+//  functions.
+//
+
+#include <Arduino.h>
+#include <ESP8266WiFi.h>
+#include <stdio.h>
+#include <BSTest.h>
+
+// Include API-Headers
+extern "C" {                  // SDK functions for Arduino IDE access
+#include "osapi.h"
+#include "user_interface.h"
+#include "espconn.h"
+}  //end, 'extern C'
+
+//---------------------------------------------------------------------------
+// Globals
+//---------------------------------------------------------------------------
+
+//                 Here                    // In 'core_esp8266_wiring.c'
+static os_timer_t  us_ovf_timer;           // 'micros_overflow_timer'
+static uint32_t    us_cnt = 0;             // 'm' in 'micros_overflow_tick()'
+static uint32_t    us_ovflow = 0;          // 'micros_overflow_count'
+static uint32_t    us_at_last_ovf = 0;     // 'micros_at_last_overflow_tick'
+
+static bool        fixed_systime = false;  // Testing var: true makes system_get_timeA() return 'us_systime'
+static bool        debugf = false;         // Testing var: true makes millis_test_DEBUG() print its sums
+static uint32_t    us_systime = 0;         // Testing var: fixed usec time stored by set_systime()
+static float       nsf = 0;                // Normalization factor, set to 1 / sample count
+static uint32_t    cntref = 0;             // Ref. comparison count (total usec of millis_orig())
+
+//---------------------------------------------------------------------------
+// Interrupt code lifted directly from "cores/core_esp8266_wiring.c",
+// with some variable renaming
+//---------------------------------------------------------------------------
+
+// Timer callback: sample the usec counter and count its wrap-arounds ('arg' is not used)
+void  us_overflow_tick ( void* arg )
+{
+  us_cnt = system_get_time();      // Current usec counter sample
+
+  // A sample smaller than the previous one means the usec counter wrapped
+  if ( us_cnt < us_at_last_ovf ) {
+    ++us_ovflow;
+  } //end-if
+  us_at_last_ovf = us_cnt;         // Keep this sample for the next comparison
+
+} //us_overflow_tick
+
+//---------------------------------------------------------------------------
+#define  REPEAT 1   // Passed below as the third (repeat) argument of os_timer_arm()
+#define  ONCE   0   // Counterpart of REPEAT; not referenced in the visible sketch
+// Attach us_overflow_tick() to 'us_ovf_timer' and arm it with a period of 60000, repeating
+void us_count_init ( void )
+{
+  os_timer_setfn( &us_ovf_timer, (os_timer_func_t*)&us_overflow_tick, 0 );   // Callback argument is 0
+  os_timer_arm( &us_ovf_timer, 60000, REPEAT );   // NOTE(review): 60000 is presumably msec (60 s) - confirm in SDK
+
+} //us_count_init
+
+//---------------------------------------------------------------------------
+// Wrapper(s) for our benchmark
+//---------------------------------------------------------------------------
+// Store 'usec' as the fixed usec time that system_get_timeA() returns while 'fixed_systime' is true
+void  set_systime ( uint32_t usec )
+{
+  us_systime = usec;
+} //set_systime
+
+//---------------------------------------------------------------------------
+// Wrapper: returns 'us_systime' when 'fixed_systime' is set, else the live system_get_time() value
+uint32_t system_get_timeA ( void )
+{
+  return ( fixed_systime ? us_systime : system_get_time() );
+} //system_get_timeA
+
+//---------------------------------------------------------------------------
+// Functions to be tested
+//---------------------------------------------------------------------------
+// Print "0x" followed by the 'n' 16-bit words at 'p' as 4-digit hex groups, highest index first
+void viewhex( uint16_t *p, uint8_t n )
+{
+  Serial.print( "0x" );
+  for ( uint8_t i = 0; i < n; i++ )
+  {
+    Serial.printf( "%04X ", p[ (n - 1) - i ] );   // Walks from p[n-1] down to p[0]
+  }
+
+} //viewhex
+
+//---------------------------------------------------------------------------
+// Support routine for 'millis_test_DEBUG()'
+// Print the accumulator 'acc', then the interim term 'itrm' (labelled 'desc') summed into it
+void view_accsum ( const char *desc, uint16_t *acc, uint16_t *itrm )
+{
+   Serial.print( "acc:  " );
+   viewhex( acc, 4 );                 // 4 x 16-bit words are printed
+   Serial.printf( "  %s = ", desc );
+   viewhex( itrm, 4 );                // Same width for the interim term
+   Serial.println();
+   
+} //view_accsum
+
+//---------------------------------------------------------------------------
+// FOR BENCHTEST
+// Original millis() function, kept as the timing baseline (it drifts 296us per usec overflow)
+unsigned long ICACHE_RAM_ATTR millis_orig ( void )
+{
+  // Get usec system time, usec overflow counter
+  uint32_t  m = system_get_time();
+  uint32_t  c = us_ovflow + ((m < us_at_last_ovf) ? 1 : 0);   // +1 when m is below the last tick's sample
+
+  return ( (c * 4294967) + m / 1000 );   // 4294967 is 2^32 / 1000 with the fraction dropped
+
+} //millis_orig
+
+//---------------------------------------------------------------------------
+// FOR DEBUG
+// Corrected millis(), 64-bit arithmetic gold standard
+// truncated to 32-bits by return
+unsigned long ICACHE_RAM_ATTR millis_corr_DEBUG( void )
+{
+  // Get usec system time, usec overflow counter
+  uint32_t  m = system_get_timeA();   // DEBUG: fixed or live time, selected by 'fixed_systime'
+  uint32_t  c = us_ovflow + ((m < us_at_last_ovf) ? 1 : 0);   // +1 when m is below the last tick's sample
+
+  return ( (c * 4294967296 + m) / 1000 );   // (2^32 c + m) / 1000, where 4294967296 = 2^32
+
+} //millis_corr_DEBUG
+
+//---------------------------------------------------------------------------
+// FOR BENCHMARK: same 64-bit reference computation as millis_corr_DEBUG(), always on live time
+unsigned long ICACHE_RAM_ATTR millis_corr ( void )
+{
+  // Get usec system time, usec overflow counter
+  uint32_t  m = system_get_time();
+  uint32_t  c = us_ovflow + ((m < us_at_last_ovf) ? 1 : 0);   // +1 when m is below the last tick's sample
+
+  return ( (c * 4294967296 + m) / 1000 );   // (2^32 c + m) / 1000, where 4294967296 = 2^32
+
+} //millis_corr
+
+//---------------------------------------------------------------------------
+// FOR DEBUG
+// millis() 'magic multiplier' approximation
+//
+// This function corrects the cumulative (296us / usec overflow) drift
+// seen in the original 'millis()' function.
+//
+// Input:
+//    'm' - 32-bit usec counter,           0 <= m <= 0xFFFFFFFF
+//    'c' - 32-bit usec overflow counter   0 <= c <  0x00400000
+// Output:
+//    Returns milliseconds in modulo 0x1,0000,0000 (0 to 0xFFFFFFFF)
+//
+// Notes:
+//
+// 1) This routine approximates the 64-bit integer division,
+//
+//    quotient =  ( 2^32 c + m ) / 1000,
+//
+//    through the use of 'magic' multipliers. A slow division is replaced by
+//    a faster multiply using a scaled multiplicative inverse of the divisor:
+//
+//    quotient =~ ( 2^32 c + m ) * k,  where k = Ceiling[ 2^n / 1000 ]
+//
+//    The precision difference between multiplier and divisor sets the
+//    upper-bound of the dividend which can be successfully divided.
+//
+//    For this application, n = 64, and the divisor (1000) has 10-bits of 
+//    precision. This sets the dividend upper-bound to (64 - 10) = 54 bits,
+//    and that of 'c' to (54 - 32) = 22 bits. This corresponds to a value
+//    for 'c' = 0x0040,0000 , or +570 years of usec counter overflows.
+//
+// 2) A distributed multiply with offset-summing is used to find k( 2^32 c + m ):
+//
+//      prd = (2^32 kh + kl) * ( 2^32 c + m )
+//          = 2^64 kh c + 2^32 kl c + 2^32 kh m + kl m
+//               (d)         (c)         (b)       (a)
+//
+//    Graphically, the offset-sums align in little endian like this:
+//                  LS -> MS
+//            32       64       96      128
+//    | a[-1]  |  a[0]  |  a[1]  |  a[2]  |
+//    |       m kl      |    0   |    0   |  a[-1] not needed
+//    |        |       m kh      |        |
+//    |        |       c kl      |        |  a[1] holds the result
+//    |        |        |       c kh      |  a[2] can be discarded
+//
+//    As only the high-word of 'm kl' and low-word of 'c kh' contribute to the
+//    overall result, only (2) 32-bit words are needed for the accumulator.
+//
+// 3) As C++ does not intrinsically test for addition overflows, one must
+//    code specifically to detect them. This approximation skips these
+//    overflow checks for speed, hence the sum,
+//
+//    highword( m kl ) + m kh + c kl  <  (2^64-1),  MUST NOT OVERFLOW.
+//
+//    To meet this criterion, not only do we have to pick 'k' to achieve our
+//    desired precision, we also have to split 'k' appropriately to avoid
+//    any addition overflows.
+//
+//    'k' should also be chosen to align the various products on byte
+//    boundaries to avoid any 64-bit shifts before additions, as they incur
+//    major time penalties. The 'k' chosen for this specific division by 1000
+//    was picked primarily to avoid shifts as well as for precision.
+//
+//    For the reasons listed above, this routine is NOT a general one.
+//    Changing divisors could break the overflow requirement and force
+//    picking a 'k' split which requires shifts before additions.
+//
+//              ** Test THOROUGHLY after making changes **
+//
+// 4) Results of time benchmarks run on an ESP8266 Huzzah feather are:
+//
+//         usec   x Orig   Comment
+// Orig:   3.18   1.00     Original code
+// Corr:  13.21   4.15     64-bit reference code
+// Test:   4.60   1.45     64-bit magic multiply, 4x32
+//
+// The magic multiplier routine runs ~3x faster than the reference. Execution
+// times can vary considerably with the numbers being multiplied, so one
+// should derate this factor to around 2x, worst case.
+//
+//   Reference function: corrected millis(), 64-bit arithmetic,
+//                       truncated to 32-bits by return
+//   unsigned long ICACHE_RAM_ATTR millis_corr_DEBUG( void )
+//   {
+//     // Get usec system time, usec overflow counter
+//     ......
+//     return ( (c * 4294967296 + m) / 1000 );  // 64-bit division is SLOW
+//   } //millis_corr_DEBUG
+//
+// 5) See this link for a good discussion on magic multipliers:
+//    http://ridiculousfish.com/blog/posts/labor-of-division-episode-i.html
+//
+
+#define  M_USEC_MAX     0xFFFFFFFF
+#define  C_COUNT_MAX    0x00400000
+
+#define  MAGIC_1E3_wLO  0x4bc6a7f0    // LS part
+#define  MAGIC_1E3_wHI  0x00418937    // MS part, magic multiplier
+
+unsigned long ICACHE_RAM_ATTR millis_test_DEBUG ( void )
+{
+  uint32_t  a[2];  // Accumulator, little endian
+  a[1] = 0;        // Zero high-acc
+  uint64_t  prd;   // Interm product
+  
+  // Get usec system time, usec overflow counter
+  uint32_t  m = system_get_timeA();
+  uint32_t  c = us_ovflow + ((m < us_at_last_ovf) ? 1 : 0);
+
+  // DEBUG: Show input vars
+  if ( debugf )
+     Serial.printf( "Test  m: 0x%08X  c: 0x%08X\n", m, c );
+
+  // (a) Init. low-acc with high-word of 1st product. The right-shift
+  //     falls on a byte boundary, hence is relatively quick.
+  ((uint64_t *)(&a[0]))[0]  =
+     ( (prd = (uint64_t)( m * (uint64_t)MAGIC_1E3_wLO )) >> 32 );
+
+  // DEBUG: Show both accumulator and interm product
+  if( debugf )
+     view_accsum( "m kl", (uint16_t *)&a[0], (uint16_t *)&prd );
+
+  ((uint64_t *)(&a[0]))[0] +=              // (b) Offset sum, low-acc
+     ( prd = ( m * (uint64_t)MAGIC_1E3_wHI ) );
+
+  // DEBUG: Show both accumulator and interm product
+  if( debugf )
+     view_accsum( "m kh", (uint16_t *)&a[0], (uint16_t *)&prd );
+
+  ((uint64_t *)(&a[0]))[0] +=              // (c) Offset sum, low-acc
+     ( prd = ( c * (uint64_t)MAGIC_1E3_wLO ) );
+
+  // DEBUG: Show both accumulator and interm product
+  if( debugf )
+     view_accsum( "c kl", (uint16_t *)&a[0], (uint16_t *)&prd );
+  
+  ((uint32_t *)(&a[1]))[0] +=              // (d) Truncated sum, low-acc
+     (uint32_t)( prd = ( c * (uint64_t)MAGIC_1E3_wHI ) );
+
+  // DEBUG: Show both accumulator and interm product
+  if( debugf )
+     view_accsum( "c kh", (uint16_t *)&a[0], (uint16_t *)&prd );
+
+  return ( a[1] );  // Extract result, high-acc
+
+} //millis_test_DEBUG
+
+//---------------------------------------------------------------------------
+// FOR BENCHTEST
+unsigned long ICACHE_RAM_ATTR millis_test ( void )
+{
+  uint32_t  a[2];  // Accumulator, little endian
+  a[1] = 0;        // Zero high-acc
+
+  // Get usec system time, usec overflow counter
+  uint32_t  m = system_get_time();
+  uint32_t  c = us_ovflow + ((m < us_at_last_ovf) ? 1 : 0);
+
+  // (a) Init. low-acc with high-word of 1st product. The right-shift
+  //     falls on a byte boundary, hence is relatively quick.
+  ((uint64_t *)(&a[0]))[0]  =
+     ( (uint64_t)( m * (uint64_t)MAGIC_1E3_wLO ) >> 32 );
+
+  ((uint64_t *)(&a[0]))[0] +=              // (b) Offset sum, low-acc
+     ( m * (uint64_t)MAGIC_1E3_wHI );
+
+  ((uint64_t *)(&a[0]))[0] +=              // (c) Offset sum, low-acc
+     ( c * (uint64_t)MAGIC_1E3_wLO );
+  
+  ((uint32_t *)(&a[1]))[0] +=              // (d) Truncated sum, low-acc
+     (uint32_t)( c * (uint64_t)MAGIC_1E3_wHI );
+
+  return ( a[1] );  // Extract result, high-acc
+
+} //millis_test
+
+//---------------------------------------------------------------------------
+// Execution time benchmark
+//---------------------------------------------------------------------------
+
+// Print one benchmark row: 'pream', count scaled by 'nsf', ratio of count to 'cntref', then 'desc'
+void  millis_rtms_print ( const char *pream, uint32_t cntx, const char *desc )
+{
+  Serial.print( pream );
+  Serial.print( nsf * (float)cntx );             // Total usec times 1/samples = mean usec per call
+  Serial.print( "   " );
+  Serial.print( (float)cntx / (float)cntref );   // Runtime relative to the reference count
+  Serial.print( "     " );  
+  Serial.println( desc );
+  
+} //millis_rtms_print
+
+//---------------------------------------------------------------------------
+void  Millis_RunTimes ( void )   // Time 'lc' calls of each millis() variant and print a comparison table
+{
+  Serial.println();
+  Serial.println( "Millis() RunTime Benchmarks" );
+
+  uint32_t lc = 100000;          // Samples
+  nsf = 1 / float(lc);           // Normalization (global)
+ 
+  uint32_t bgn;                  // usec timestamp taken just before each timed call
+  uint32_t cnto  = 0, cntv = 0;  // Total usec: millis_orig(), core millis()
+  uint32_t cntcx = 0, cntc = 0;  // Total usec: millis_corr_DEBUG(), millis_corr()
+  uint32_t cntfx = 0, cntf = 0;  // Total usec: millis_test_DEBUG(), millis_test()
+
+  // Setup timer values
+  fixed_systime = true;          // NOTE(review): set back to false just below, before any use
+  us_ovflow      =  C_COUNT_MAX;
+  us_at_last_ovf = (M_USEC_MAX - (20 * 1000000));   // Max. less 20 sec
+
+// No printing, systime active
+  debugf = false; fixed_systime = false;
+
+  for (uint32_t i = 0; i < lc; i++ )
+  {
+    bgn = system_get_time();
+    millis_orig();      
+    cnto += system_get_time() - bgn;    // Elapsed usec for this call, added to the running total
+
+    bgn = system_get_time();
+    millis();      
+    cntv += system_get_time() - bgn;
+
+    bgn = system_get_time();
+    millis_corr_DEBUG();
+    cntcx += system_get_time() - bgn;
+        
+    bgn = system_get_time();
+    millis_corr();
+    cntc += system_get_time() - bgn;
+
+    bgn = system_get_time();
+    millis_test_DEBUG();
+    cntfx += system_get_time() - bgn;
+
+    bgn = system_get_time();
+    millis_test();
+    cntf += system_get_time() - bgn;
+
+    yield();
+  } //end-for
+
+  cntref = cnto;  // Set global ref. count
+
+  Serial.println();
+  Serial.println( "         usec   x Orig   Comment" );
+
+  millis_rtms_print( " Orig:   ",  cntref, "Original code" ); 
+  millis_rtms_print( " Core:   ",  cntv,   "Current  core" ); 
+  Serial.println();
+
+  millis_rtms_print( "  Ref:  ",   cntcx,  "64-bit reference code,      DEBUG" );
+  millis_rtms_print( " Test:   ",  cntfx,  "64-bit magic multiply, 4x32 DEBUG" );  
+  Serial.println();
+
+  millis_rtms_print( "  Ref:  ",   cntc,   "64-bit reference code" );  
+  millis_rtms_print( " Test:   ",  cntf,   "64-bit magic multiply, 4x32" );
+  Serial.println();
+
+  Serial.println( F("*** End, Bench Test ***") );
+  Serial.println();
+
+} //Millis_RunTimes
+
+//---------------------------------------------------------------------------
+// Debug millis_test()
+//---------------------------------------------------------------------------
+
+bool  Debug_Millis_Test ( void )   // Returns true when millis_test_DEBUG() equals millis_corr_DEBUG()
+{
+  uint32_t  m, msc, mstx;          // Fixed usec input, reference msec, test msec
+  int32_t   diff;                  // Test result minus reference result
+
+// Switch over to fixed system time, enable printing
+  fixed_systime = true;  debugf = true;
+  
+  us_ovflow    = C_COUNT_MAX;      // Overflow count placed at the documented bound for 'c'
+  m            = (M_USEC_MAX - (0 * 1000000));   // Offset is 0 here, so m = M_USEC_MAX
+  set_systime( m );
+  us_at_last_ovf = m - 1;  // Disables 'c' bump
+
+// Millis() comparison, test vs. reference
+  Serial.println();
+  mstx = millis_test_DEBUG();
+  msc  = millis_corr_DEBUG();
+  diff = (int32_t)(mstx - msc);
+
+  Serial.println();
+  Serial.println( "         m         Test    Reference   Difference" );
+  Serial.printf( "0x%08x   0x%08x   0x%08X    %9d\n", m, mstx, msc, diff );
+  Serial.println();
+
+// No printing, variable systime
+  debugf = false; fixed_systime = false;
+
+  return( (bool)( diff == 0 ) );   // Good test, matches reference
+
+} //Debug_Millis_Test
+
+//---------------------------------------------------------------------------
+//---------------------------------------------------------------------------
+
+BS_ENV_DECLARE();
+
+void setup ()   // Open the serial port, switch WiFi off, start overflow sampling, hand over to BSTest
+{
+  Serial.begin(115200);
+  WiFi.mode( WIFI_OFF );
+  us_count_init();        // Start up timer overflow sampling  
+  BS_RUN(Serial);         // NOTE(review): presumably runs the TEST_CASEs below over Serial - confirm in BSTest
+
+} //setup
+
+//---------------------------------------------------------------------------
+void loop(void)   // No per-iteration work; the tests are started from setup()
+{
+   yield();
+} //loop
+
+//---------------------------------------------------------------------------
+// Test cases
+//---------------------------------------------------------------------------
+TEST_CASE( "Millis RunTime Benchmarks", "[bs]" )
+{
+    Millis_RunTimes();   // Prints the timing table; this case contains no CHECK
+} //testcase1
+
+TEST_CASE( "Debug 'millis_test()' Code", "[bs]" )
+{
+    bool ok = Debug_Millis_Test();   // true when test and reference millis values are equal
+    CHECK( ok );
+} //testcase2
+
+//---------------------------------------------------------------------------
+//---------------------------------------------------------------------------

From b1ae5d475ab5e4950b178e690e00284dedeed211 Mon Sep 17 00:00:00 2001
From: mrwgx3 <23343823+mrwgx3@users.noreply.github.com>
Date: Thu, 8 Mar 2018 19:52:26 -0700
Subject: [PATCH 3/5] Eliminate 'punning' warning

Add union 'acc' in millis() to eliminate 'punning' compiler warning
---
 cores/esp8266/core_esp8266_wiring.c | 29 ++++++++++++++++-------------
 1 file changed, 16 insertions(+), 13 deletions(-)

diff --git a/cores/esp8266/core_esp8266_wiring.c b/cores/esp8266/core_esp8266_wiring.c
index 68ead05c6d..be53be05a7 100644
--- a/cores/esp8266/core_esp8266_wiring.c
+++ b/cores/esp8266/core_esp8266_wiring.c
@@ -160,9 +160,12 @@ void micros_overflow_tick(void* arg) {
 
 unsigned long ICACHE_RAM_ATTR millis()
 {
-  uint32_t  a[2];  // Accumulator, little endian
-  a[1] = 0;        // Zero high-acc
-
+  union {
+     uint64_t  q;     // Accumulator, 64-bit, little endian
+     uint32_t  a[2];  // ..........., 32-bit  segments
+  } acc;
+  acc.a[1] = 0;      // Zero high-acc
+  
   // Get usec system time, usec overflow counter
   uint32_t  m = system_get_time();
   uint32_t  c = micros_overflow_count +
@@ -170,19 +173,19 @@ unsigned long ICACHE_RAM_ATTR millis()
 
   // (a) Init. low-acc with high-word of 1st product. The right-shift
   //     falls on a byte boundary, hence is relatively quick.
-  ((uint64_t *)(&a[0]))[0]  =
-     ( (uint64_t)( m * (uint64_t)MAGIC_1E3_wLO ) >> 32 );
+  
+  acc.q  = ( (uint64_t)( m * (uint64_t)MAGIC_1E3_wLO ) >> 32 );
 
-  ((uint64_t *)(&a[0]))[0] +=              // (b) Offset sum, low-acc
-     ( m * (uint64_t)MAGIC_1E3_wHI );
+  // (b) Offset sum, low-acc
+  acc.q += ( m * (uint64_t)MAGIC_1E3_wHI );\
 
-  ((uint64_t *)(&a[0]))[0] +=              // (c) Offset sum, low-acc
-     ( c * (uint64_t)MAGIC_1E3_wLO );
-  
-  ((uint32_t *)(&a[1]))[0] +=              // (d) Truncated sum, low-acc
-     (uint32_t)( c * (uint64_t)MAGIC_1E3_wHI );
+  // (c) Offset sum, low-acc
+  acc.q += ( c * (uint64_t)MAGIC_1E3_wLO );
+
+  // (d) Truncated sum, high-acc
+  acc.a[1] += (uint32_t)( c * (uint64_t)MAGIC_1E3_wHI );
 
-  return ( a[1] );  // Extract result, high-acc
+  return ( acc.a[1] );  // Extract result, high-acc
 
 } //millis
 

From 442c526fd2c161180a78d91258e571565c6b4bb1 Mon Sep 17 00:00:00 2001
From: mrwgx3 <23343823+mrwgx3@users.noreply.github.com>
Date: Thu, 8 Mar 2018 19:54:20 -0700
Subject: [PATCH 4/5] Correct minor typo

---
 cores/esp8266/core_esp8266_wiring.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/cores/esp8266/core_esp8266_wiring.c b/cores/esp8266/core_esp8266_wiring.c
index be53be05a7..3d91043d34 100644
--- a/cores/esp8266/core_esp8266_wiring.c
+++ b/cores/esp8266/core_esp8266_wiring.c
@@ -164,7 +164,7 @@ unsigned long ICACHE_RAM_ATTR millis()
      uint64_t  q;     // Accumulator, 64-bit, little endian
      uint32_t  a[2];  // ..........., 32-bit  segments
   } acc;
-  acc.a[1] = 0;      // Zero high-acc
+  acc.a[1] = 0;       // Zero high-acc
   
   // Get usec system time, usec overflow counter
   uint32_t  m = system_get_time();
@@ -177,7 +177,7 @@ unsigned long ICACHE_RAM_ATTR millis()
   acc.q  = ( (uint64_t)( m * (uint64_t)MAGIC_1E3_wLO ) >> 32 );
 
   // (b) Offset sum, low-acc
-  acc.q += ( m * (uint64_t)MAGIC_1E3_wHI );\
+  acc.q += ( m * (uint64_t)MAGIC_1E3_wHI );
 
   // (c) Offset sum, low-acc
   acc.q += ( c * (uint64_t)MAGIC_1E3_wLO );

From fb41bd3b5a02d7c95788e76b464508660814bddf Mon Sep 17 00:00:00 2001
From: mrwgx3 <23343823+mrwgx3@users.noreply.github.com>
Date: Thu, 8 Mar 2018 20:07:48 -0700
Subject: [PATCH 5/5] Eliminate 'punning' warning

 Add union 'acc' to eliminate 'punning' compiler warning in 'millis_test_DEBUG()' and 'millis_test()'
---
 .../device/test_millis_mm/test_millis_mm.ino  | 63 ++++++++++---------
 1 file changed, 35 insertions(+), 28 deletions(-)

diff --git a/tests/device/test_millis_mm/test_millis_mm.ino b/tests/device/test_millis_mm/test_millis_mm.ino
index eaca0fe8b3..4bc4c78e2e 100644
--- a/tests/device/test_millis_mm/test_millis_mm.ino
+++ b/tests/device/test_millis_mm/test_millis_mm.ino
@@ -247,9 +247,13 @@ unsigned long ICACHE_RAM_ATTR millis_corr ( void )
 
 unsigned long ICACHE_RAM_ATTR millis_test_DEBUG ( void )
 {
-  uint32_t  a[2];  // Accumulator, little endian
-  a[1] = 0;        // Zero high-acc
-  uint64_t  prd;   // Interm product
+  union {
+    uint64_t  q;     // Accumulator, 64-bit, little endian
+    uint32_t  a[2];  // ..........., 32-bit  segments
+  } acc;
+  acc.a[1] = 0;      // Zero high-acc
+  
+  uint64_t  prd;     // Interm product
   
   // Get usec system time, usec overflow counter
   uint32_t  m = system_get_timeA();
@@ -261,35 +265,35 @@ unsigned long ICACHE_RAM_ATTR millis_test_DEBUG ( void )
 
   // (a) Init. low-acc with high-word of 1st product. The right-shift
   //     falls on a byte boundary, hence is relatively quick.
-  ((uint64_t *)(&a[0]))[0]  =
-     ( (prd = (uint64_t)( m * (uint64_t)MAGIC_1E3_wLO )) >> 32 );
+
+  acc.q  = ( (prd = (uint64_t)( m * (uint64_t)MAGIC_1E3_wLO )) >> 32 );
 
   // DEBUG: Show both accumulator and interm product
   if( debugf )
-     view_accsum( "m kl", (uint16_t *)&a[0], (uint16_t *)&prd );
+     view_accsum( "m kl", (uint16_t *)&acc.q, (uint16_t *)&prd );
 
-  ((uint64_t *)(&a[0]))[0] +=              // (b) Offset sum, low-acc
-     ( prd = ( m * (uint64_t)MAGIC_1E3_wHI ) );
+  // (b) Offset sum, low-acc
+  acc.q += ( prd = ( m * (uint64_t)MAGIC_1E3_wHI ) );
 
   // DEBUG: Show both accumulator and interm product
   if( debugf )
-     view_accsum( "m kh", (uint16_t *)&a[0], (uint16_t *)&prd );
+     view_accsum( "m kh", (uint16_t *)&acc.q, (uint16_t *)&prd );
 
-  ((uint64_t *)(&a[0]))[0] +=              // (c) Offset sum, low-acc
-     ( prd = ( c * (uint64_t)MAGIC_1E3_wLO ) );
+  // (c) Offset sum, low-acc
+  acc.q += ( prd = ( c * (uint64_t)MAGIC_1E3_wLO ) );
 
   // DEBUG: Show both accumulator and interm product
   if( debugf )
-     view_accsum( "c kl", (uint16_t *)&a[0], (uint16_t *)&prd );
+     view_accsum( "c kl", (uint16_t *)&acc.q, (uint16_t *)&prd );
   
-  ((uint32_t *)(&a[1]))[0] +=              // (d) Truncated sum, low-acc
-     (uint32_t)( prd = ( c * (uint64_t)MAGIC_1E3_wHI ) );
+  // (d) Truncated sum, high-acc
+  acc.a[1] += (uint32_t)( prd = ( c * (uint64_t)MAGIC_1E3_wHI ) );
 
   // DEBUG: Show both accumulator and interm product
   if( debugf )
-     view_accsum( "c kh", (uint16_t *)&a[0], (uint16_t *)&prd );
+     view_accsum( "c kh", (uint16_t *)&acc.q, (uint16_t *)&prd );
 
-  return ( a[1] );  // Extract result, high-acc
+  return ( acc.a[1] );  // Extract result, high-acc
 
 } //millis_test_DEBUG
 
@@ -297,8 +301,11 @@ unsigned long ICACHE_RAM_ATTR millis_test_DEBUG ( void )
 // FOR BENCHTEST
 unsigned long ICACHE_RAM_ATTR millis_test ( void )
 {
-  uint32_t  a[2];  // Accumulator, little endian
-  a[1] = 0;        // Zero high-acc
+  union {
+    uint64_t  q;     // Accumulator, 64-bit, little endian
+    uint32_t  a[2];  // ..........., 32-bit  segments
+  } acc;
+  acc.a[1] = 0;      // Zero high-acc
 
   // Get usec system time, usec overflow counter
   uint32_t  m = system_get_time();
@@ -306,19 +313,19 @@ unsigned long ICACHE_RAM_ATTR millis_test ( void )
 
   // (a) Init. low-acc with high-word of 1st product. The right-shift
   //     falls on a byte boundary, hence is relatively quick.
-  ((uint64_t *)(&a[0]))[0]  =
-     ( (uint64_t)( m * (uint64_t)MAGIC_1E3_wLO ) >> 32 );
+  
+  acc.q  = ( (uint64_t)( m * (uint64_t)MAGIC_1E3_wLO ) >> 32 );
 
-  ((uint64_t *)(&a[0]))[0] +=              // (b) Offset sum, low-acc
-     ( m * (uint64_t)MAGIC_1E3_wHI );
+  // (b) Offset sum, low-acc
+  acc.q += ( m * (uint64_t)MAGIC_1E3_wHI );
 
-  ((uint64_t *)(&a[0]))[0] +=              // (c) Offset sum, low-acc
-     ( c * (uint64_t)MAGIC_1E3_wLO );
-  
-  ((uint32_t *)(&a[1]))[0] +=              // (d) Truncated sum, low-acc
-     (uint32_t)( c * (uint64_t)MAGIC_1E3_wHI );
+  // (c) Offset sum, low-acc
+  acc.q += ( c * (uint64_t)MAGIC_1E3_wLO );
+
+  // (d) Truncated sum, high-acc
+  acc.a[1] += (uint32_t)( c * (uint64_t)MAGIC_1E3_wHI );
 
-  return ( a[1] );  // Extract result, high-acc
+  return ( acc.a[1] );  // Extract result, high-acc
 
 } //millis_test