From 7b4c86b03eceb1fdb6e0bb8e85160dac8ba6a24a Mon Sep 17 00:00:00 2001
From: Jacob Pratt <jacob@jhpratt.dev>
Date: Thu, 19 Mar 2026 00:45:06 -0400
Subject: [PATCH] Optimize 128-bit integer formatting

The compiler is unaware of the restricted range of the input, so it is
unable to optimize out the division and modulus by 10^4 in the final
loop iteration. By peeling that iteration out of the loop and handling
it manually, we get a nontrivial performance gain.
---
 src/lib.rs | 11 ++++++++++-
 1 file changed, 10 insertions(+), 1 deletion(-)
diff --git a/src/lib.rs b/src/lib.rs
index 884193e..fbe3918 100644
--- a/src/lib.rs
+++ b/src/lib.rs
@@ -410,7 +410,7 @@ fn enc_16lsd<const OFFSET: usize>(buf: &mut [MaybeUninit<u8>], n: u64) {
     let mut remain = n;
 
     // Format per four digits from the lookup table.
-    for quad_index in (0..4).rev() {
+    for quad_index in (1..4).rev() {
         // pull two pairs
         let quad = remain % 1_00_00;
         remain /= 1_00_00;
@@ -426,6 +426,15 @@ fn enc_16lsd<const OFFSET: usize>(buf: &mut [MaybeUninit<u8>], n: u64) {
                 .write(*DECIMAL_PAIRS.0.get_unchecked(pair2 as usize * 2 + 1));
         }
     }
+
+    // final two pairs
+    let (pair1, pair2) = divmod100(remain as u32);
+    unsafe {
+        buf[OFFSET + 0].write(*DECIMAL_PAIRS.0.get_unchecked(pair1 as usize * 2 + 0));
+        buf[OFFSET + 1].write(*DECIMAL_PAIRS.0.get_unchecked(pair1 as usize * 2 + 1));
+        buf[OFFSET + 2].write(*DECIMAL_PAIRS.0.get_unchecked(pair2 as usize * 2 + 0));
+        buf[OFFSET + 3].write(*DECIMAL_PAIRS.0.get_unchecked(pair2 as usize * 2 + 1));
+    }
 }
 
 // Euclidean division plus remainder with constant 1E16 basically consumes 16